fix(provider): clamp max_tokens to fit OpenRouter provider context window

- OpenRouter currently rejects requests where input + max_output exceeds the
serving provider's context limit (which may be tighter than the
model-level value)
- use the smaller of ContextWindow and MaxOutput as the cap, with a
4096-token input reserve
This commit is contained in:
Neil Vallecer 2026-06-09 23:25:37 +08:00
parent 3d031dde26
commit 1425e68636
2 changed files with 27 additions and 0 deletions

View file

@ -286,6 +286,8 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) {
ctxWin := d.ContextLength
if ctxWin == 0 {
ctxWin = d.TopProvider.ContextLength
} else if d.TopProvider.ContextLength > 0 && d.TopProvider.ContextLength < ctxWin {
ctxWin = d.TopProvider.ContextLength
}
maxOut := 0
if d.TopProvider.MaxCompletionTokens != nil {

View file

@ -183,6 +183,31 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
if maxTok <= 0 {
maxTok = m.MaxOutput
}
// Clamp max_tokens so output + minimum input fits within the context
// window. Some providers (OpenRouter) enforce input + max_output <=
// context_length and reject requests where the total exceeds it.
// This reservation ensures basic user input (system prompt + first
// message + tool definitions) has room.
//
// Use the smaller of ContextWindow and MaxOutput as the cap because
// some providers (OpenRouter) report inflated model-level context
// windows (e.g. 1000000) while the serving provider enforces a much
// tighter limit (e.g. 262144). When both are wrong we still land on
// the output cap, which is the actual limit reported by the provider.
if m.ContextWindow > 0 && m.MaxOutput > 0 {
ctxCap := m.ContextWindow
if m.MaxOutput < ctxCap {
ctxCap = m.MaxOutput
}
const reserve = 4096
clamped := ctxCap - reserve
if clamped < 1024 {
clamped = 1024
}
if maxTok > clamped {
maxTok = clamped
}
}
if m.Reasoning {
if maxTok > 0 {
out.MaxCompletionTok = &maxTok