Merge #24: clamp max_tokens to fit context window (proportional reserve)

Fixes OpenRouter rejecting requests where max_tokens equals the served context window. Prefer top_provider.context_length on discovery and clamp max_tokens to ContextWindow minus a proportional reserve (window/8 capped at 4096). Reworked from the original PR so the reserve derives from the window, not MaxOutput: models whose output already fits are untouched and small-window models are not over-penalized. Co-authored-by: Neil-urk12 <neil-urk12@users.noreply.github.com>
2026-06-27 05:46:34 +02:00 · 2026-06-09 19:29:54 +02:00 · 2026-06-09 19:29:54 +02:00 · ffca64d4fd
commit ffca64d4fd
parent c2c9a5ea28 d2fa18270d
4 changed files with 260 additions and 1 deletions
--- a/packages/provider/discover.go
+++ b/packages/provider/discover.go
@ -284,7 +284,7 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) {
 			display = d.ID
 		}
 		ctxWin := d.ContextLength
-		if ctxWin == 0 {
+		if d.TopProvider.ContextLength > 0 && (ctxWin == 0 || d.TopProvider.ContextLength < ctxWin) {
 			ctxWin = d.TopProvider.ContextLength
 		}
 		maxOut := 0
--- a/packages/provider/discover_openrouter_test.go
+++ b/packages/provider/discover_openrouter_test.go
@ -0,0 +1,53 @@
+package provider
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+// TestDiscoverOpenRouterPrefersServedContextLength verifies the
+// discovery fix: OpenRouter reports an inflated model-level
+// context_length (the model's theoretical max) alongside the serving
+// provider's real top_provider.context_length. When the served limit is
+// smaller, discovery must use it, because that's the limit OpenRouter
+// actually enforces. Using the inflated value made every context-window
+// check useless and let max_tokens exceed the real ceiling.
+func TestDiscoverOpenRouterPrefersServedContextLength(t *testing.T) {
+	const body = `{"data":[
+		{"id":"infl","name":"Inflated","context_length":1000000,
+		 "top_provider":{"context_length":262144}},
+		{"id":"served-bigger","name":"ServedBigger","context_length":131072,
+		 "top_provider":{"context_length":262144}},
+		{"id":"no-top","name":"NoTop","context_length":128000,
+		 "top_provider":{"context_length":0}}
+	]}`
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(body))
+	}))
+	defer srv.Close()
+
+	models, err := DiscoverOpenRouter(context.Background(), srv.URL)
+	if err != nil {
+		t.Fatal(err)
+	}
+	byID := map[string]Model{}
+	for _, m := range models {
+		byID[m.ID] = m
+	}
+
+	// Inflated model-level window: use the smaller served limit.
+	if got := byID["infl"].ContextWindow; got != 262144 {
+		t.Errorf("inflated model ContextWindow = %d; want 262144 (served limit)", got)
+	}
+	// Served limit larger than model-level: keep the smaller model value.
+	if got := byID["served-bigger"].ContextWindow; got != 131072 {
+		t.Errorf("served-bigger ContextWindow = %d; want 131072 (model value, the smaller one)", got)
+	}
+	// No served limit: fall back to the model-level value.
+	if got := byID["no-top"].ContextWindow; got != 128000 {
+		t.Errorf("no-top ContextWindow = %d; want 128000 (model value, no top_provider)", got)
+	}
+}
--- a/packages/provider/openai.go
+++ b/packages/provider/openai.go
@ -183,6 +183,43 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
 	if maxTok <= 0 {
 		maxTok = m.MaxOutput
 	}
+	// Clamp max_tokens so output plus a minimum input reservation fits
+	// within the context window. Some providers (OpenRouter) enforce
+	// input + max_output <= context_length and reject requests where the
+	// total exceeds it. Reserving headroom guarantees the system prompt,
+	// first message, and tool definitions have room.
+	//
+	// The cap is derived from the context window, never from MaxOutput:
+	// MaxOutput is already the output ceiling, so subtracting from it
+	// would shrink every model's budget even when its output comfortably
+	// fits the window. We only lower maxTok when ContextWindow - reserve
+	// is actually tighter than the requested budget, which is exactly the
+	// pathological case (e.g. a model whose MaxOutput equals its window).
+	//
+	// The reserve is proportional (window/8, capped at 4096) rather than a
+	// flat 4096 so small-window models aren't over-penalized: a flat 4096
+	// would halve gpt-4's 8192 budget, while window/8 reserves a sensible
+	// 1024 there and still tops out at 4096 for large contexts.
+	//
+	// Some providers (OpenRouter) report inflated model-level context
+	// windows (e.g. 1000000) while the serving provider enforces a much
+	// tighter limit (e.g. 262144). Discovery already prefers the serving
+	// provider's smaller context_length, so m.ContextWindow is the real
+	// limit by the time we get here.
+	if m.ContextWindow > 0 {
+		reserve := m.ContextWindow / 8
+		const maxReserve = 4096
+		if reserve > maxReserve {
+			reserve = maxReserve
+		}
+		clamped := m.ContextWindow - reserve
+		if clamped < 1 {
+			clamped = 1
+		}
+		if maxTok > clamped {
+			maxTok = clamped
+		}
+	}
 	if m.Reasoning {
 		if maxTok > 0 {
 			out.MaxCompletionTok = &maxTok
--- a/packages/provider/openai_clamp_test.go
+++ b/packages/provider/openai_clamp_test.go
@ -0,0 +1,169 @@
+package provider
+
+import "testing"
+
+// withLiveModels installs a synthetic catalog overlay for the duration
+// of a test and restores the previous state afterwards. It lets these
+// tests pin exact ContextWindow / MaxOutput values without depending on
+// the real catalog.
+func withLiveModels(t *testing.T, models []Model) {
+	t.Helper()
+	activeMu.Lock()
+	prevActive := active
+	prevSet := activeSet
+	activeMu.Unlock()
+
+	SetLiveModels(models)
+
+	t.Cleanup(func() {
+		activeMu.Lock()
+		active = prevActive
+		activeSet = prevSet
+		activeMu.Unlock()
+	})
+}
+
+// outputBudget pulls whichever max-output field buildRequest populated
+// for a non-reasoning model (out.MaxTokens) so tests can assert the
+// clamped value regardless of the reasoning path.
+func outputBudget(t *testing.T, out *oaiRequest) int {
+	t.Helper()
+	switch {
+	case out.MaxTokens != nil:
+		return *out.MaxTokens
+	case out.MaxCompletionTok != nil:
+		return *out.MaxCompletionTok
+	default:
+		t.Fatalf("no output budget set on request")
+		return 0
+	}
+}
+
+// TestBuildRequestDoesNotClampWhenOutputFitsWindow is the regression
+// guard for the original PR #24 behavior: a normal model whose MaxOutput
+// comfortably fits its context window must NOT have its budget reduced.
+// The earlier implementation subtracted 4096 from min(window, MaxOutput),
+// which silently shrank every OpenAI/DeepSeek model by 4096 tokens.
+func TestBuildRequestDoesNotClampWhenOutputFitsWindow(t *testing.T) {
+	withLiveModels(t, []Model{{
+		Provider:      "openai",
+		ID:            "fits-fine",
+		ContextWindow: 128000,
+		MaxOutput:     16384,
+	}})
+	c := &openaiClient{name: "openai"}
+
+	out, err := c.buildRequest(Request{
+		Model:    "fits-fine",
+		Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := outputBudget(t, out); got != 16384 {
+		t.Fatalf("output budget = %d; want 16384 (must not clamp when output fits window)", got)
+	}
+}
+
+// TestBuildRequestClampsLargeWindowAtMaxReserve covers the bug PR #24
+// targets for a large-window model whose MaxOutput equals its (served)
+// context window. Sending max_tokens == window leaves no room for input,
+// so OpenRouter rejects it. The proportional reserve (window/8) is capped
+// at 4096, so a 262144 window reserves the cap, not window/8.
+func TestBuildRequestClampsLargeWindowAtMaxReserve(t *testing.T) {
+	const window = 262144
+	withLiveModels(t, []Model{{
+		Provider:      "openrouter",
+		ID:            "nemotron-tight",
+		ContextWindow: window,
+		MaxOutput:     window,
+	}})
+	c := &openaiClient{name: "openrouter"}
+
+	out, err := c.buildRequest(Request{
+		Model:    "nemotron-tight",
+		Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	want := window - 4096 // window/8 = 32768 > 4096 cap, so reserve = 4096
+	if got := outputBudget(t, out); got != want {
+		t.Fatalf("output budget = %d; want %d (window - capped reserve)", got, want)
+	}
+}
+
+// TestBuildRequestProportionalReserveSmallWindow verifies the reserve is
+// proportional for small windows so they aren't over-penalized. gpt-4's
+// 8192/8192 model reserves window/8 = 1024 (not a flat 4096, which would
+// halve its budget), leaving 7168.
+func TestBuildRequestProportionalReserveSmallWindow(t *testing.T) {
+	const window = 8192
+	withLiveModels(t, []Model{{
+		Provider:      "openai",
+		ID:            "gpt-4-like",
+		ContextWindow: window,
+		MaxOutput:     window,
+	}})
+	c := &openaiClient{name: "openai"}
+
+	out, err := c.buildRequest(Request{
+		Model:    "gpt-4-like",
+		Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	want := window - window/8 // 8192 - 1024 = 7168
+	if got := outputBudget(t, out); got != want {
+		t.Fatalf("output budget = %d; want %d (window - window/8)", got, want)
+	}
+}
+
+// TestBuildRequestClampFloor verifies a tiny context window never yields
+// a zero or negative budget. window/8 of 16 is 2, leaving 14.
+func TestBuildRequestClampFloor(t *testing.T) {
+	withLiveModels(t, []Model{{
+		Provider:      "openrouter",
+		ID:            "tiny-window",
+		ContextWindow: 16,
+		MaxOutput:     16,
+	}})
+	c := &openaiClient{name: "openrouter"}
+
+	out, err := c.buildRequest(Request{
+		Model:    "tiny-window",
+		Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := outputBudget(t, out); got != 14 {
+		t.Fatalf("output budget = %d; want 14 (window - window/8, still positive)", got)
+	}
+}
+
+// TestBuildRequestClampDoesNotInflate confirms the clamp only ever
+// lowers the budget: a small explicit MaxTokens under the clamp ceiling
+// passes through untouched.
+func TestBuildRequestClampDoesNotInflate(t *testing.T) {
+	withLiveModels(t, []Model{{
+		Provider:      "openrouter",
+		ID:            "roomy",
+		ContextWindow: 262144,
+		MaxOutput:     262144,
+	}})
+	c := &openaiClient{name: "openrouter"}
+
+	out, err := c.buildRequest(Request{
+		Model:     "roomy",
+		MaxTokens: 8000,
+		Messages:  []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := outputBudget(t, out); got != 8000 {
+		t.Fatalf("output budget = %d; want 8000 (explicit request below ceiling, unchanged)", got)
+	}
+}