From 1425e68636b67c73aaa224c138e098318be4d6d6 Mon Sep 17 00:00:00 2001
From: Neil Vallecer <neilvallecer12@gmail.com>
Date: Tue, 9 Jun 2026 23:25:37 +0800
Subject: [PATCH] fix(provider): clamp max_tokens to fit OpenRouter provider
 context window

- OpenRouter currently rejects requests where input + max_output exceeds
the serving provider's context limit (which may be tighter than the
model-level value)
- use the smaller of ContextWindow and MaxOutput as the cap, with a
4096-token input reserve
---
 packages/provider/discover.go |  2 ++
 packages/provider/openai.go   | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/packages/provider/discover.go b/packages/provider/discover.go
index 7d1798e..8da3e82 100644
--- a/packages/provider/discover.go
+++ b/packages/provider/discover.go
@@ -286,6 +286,8 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) {
 		ctxWin := d.ContextLength
 		if ctxWin == 0 {
 			ctxWin = d.TopProvider.ContextLength
+		} else if d.TopProvider.ContextLength > 0 && d.TopProvider.ContextLength < ctxWin {
+			ctxWin = d.TopProvider.ContextLength
 		}
 		maxOut := 0
 		if d.TopProvider.MaxCompletionTokens != nil {
diff --git a/packages/provider/openai.go b/packages/provider/openai.go
index 9e69486..f33d94f 100644
--- a/packages/provider/openai.go
+++ b/packages/provider/openai.go
@@ -183,6 +183,31 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
 	if maxTok <= 0 {
 		maxTok = m.MaxOutput
 	}
+	// Clamp max_tokens so output + minimum input fits within the context
+	// window. Some providers (OpenRouter) enforce input + max_output <=
+	// context_length and reject requests where the total exceeds it.
+	// This reservation ensures basic user input (system prompt + first
+	// message + tool definitions) has room.
+	//
+	// Use the smaller of ContextWindow and MaxOutput as the cap because
+	// some providers (OpenRouter) report inflated model-level context
+	// windows (e.g. 1000000) while the serving provider enforces a much
+	// tighter limit (e.g. 262144). When both are wrong we still land on
+	// the output cap, which is the actual limit reported by the provider.
+	if m.ContextWindow > 0 && m.MaxOutput > 0 {
+		ctxCap := m.ContextWindow
+		if m.MaxOutput < ctxCap {
+			ctxCap = m.MaxOutput
+		}
+		const reserve = 4096
+		clamped := ctxCap - reserve
+		if clamped < 1024 {
+			clamped = 1024
+		}
+		if maxTok > clamped {
+			maxTok = clamped
+		}
+	}
 	if m.Reasoning {
 		if maxTok > 0 {
 			out.MaxCompletionTok = &maxTok