From 1425e68636b67c73aaa224c138e098318be4d6d6 Mon Sep 17 00:00:00 2001 From: Neil Vallecer Date: Tue, 9 Jun 2026 23:25:37 +0800 Subject: [PATCH] fix(provider): clamp max_tokens to fit OpenRouter provider context window - it currently rejects requests where input + max_output exceeds the serving provider's context limit (which may be tighter than the model-level value) - use the smaller of ContextWindow and MaxOutput as the cap, with a 4096-token input reserve --- packages/provider/discover.go | 2 ++ packages/provider/openai.go | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/packages/provider/discover.go b/packages/provider/discover.go index 7d1798e..8da3e82 100644 --- a/packages/provider/discover.go +++ b/packages/provider/discover.go @@ -286,6 +286,8 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) { ctxWin := d.ContextLength if ctxWin == 0 { ctxWin = d.TopProvider.ContextLength + } else if d.TopProvider.ContextLength > 0 && d.TopProvider.ContextLength < ctxWin { + ctxWin = d.TopProvider.ContextLength } maxOut := 0 if d.TopProvider.MaxCompletionTokens != nil { diff --git a/packages/provider/openai.go b/packages/provider/openai.go index 9e69486..f33d94f 100644 --- a/packages/provider/openai.go +++ b/packages/provider/openai.go @@ -183,6 +183,31 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) { if maxTok <= 0 { maxTok = m.MaxOutput } + // Clamp max_tokens so output + minimum input fits within the context + // window. Some providers (OpenRouter) enforce input + max_output <= + // context_length and reject requests where the total exceeds it. + // This reservation ensures basic user input (system prompt + first + // message + tool definitions) has room. + // + // Use the smaller of ContextWindow and MaxOutput as the cap because + // some providers (OpenRouter) report inflated model-level context + // windows (e.g. 
1000000) while the serving provider enforces a much + // tighter limit (e.g. 262144). When both are wrong we still land on + // the output cap, which is the actual limit reported by the provider. + if m.ContextWindow > 0 && m.MaxOutput > 0 { + ctxCap := m.ContextWindow + if m.MaxOutput < ctxCap { + ctxCap = m.MaxOutput + } + const reserve = 4096 + clamped := ctxCap - reserve + if clamped < 1024 { + clamped = 1024 + } + if maxTok > clamped { + maxTok = clamped + } + } if m.Reasoning { if maxTok > 0 { out.MaxCompletionTok = &maxTok