diff --git a/packages/provider/discover.go b/packages/provider/discover.go index 7d1798e..be1d290 100644 --- a/packages/provider/discover.go +++ b/packages/provider/discover.go @@ -284,7 +284,7 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) { display = d.ID } ctxWin := d.ContextLength - if ctxWin == 0 { + if d.TopProvider.ContextLength > 0 && (ctxWin == 0 || d.TopProvider.ContextLength < ctxWin) { ctxWin = d.TopProvider.ContextLength } maxOut := 0 diff --git a/packages/provider/discover_openrouter_test.go b/packages/provider/discover_openrouter_test.go new file mode 100644 index 0000000..d343e85 --- /dev/null +++ b/packages/provider/discover_openrouter_test.go @@ -0,0 +1,53 @@ +package provider + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" +) + +// TestDiscoverOpenRouterPrefersServedContextLength verifies the +// discovery fix: OpenRouter reports an inflated model-level +// context_length (the model's theoretical max) alongside the serving +// provider's real top_provider.context_length. When the served limit is +// smaller, discovery must use it, because that's the limit OpenRouter +// actually enforces. Using the inflated value made every context-window +// check useless and let max_tokens exceed the real ceiling. +func TestDiscoverOpenRouterPrefersServedContextLength(t *testing.T) { + const body = `{"data":[ + {"id":"infl","name":"Inflated","context_length":1000000, + "top_provider":{"context_length":262144}}, + {"id":"served-bigger","name":"ServedBigger","context_length":131072, + "top_provider":{"context_length":262144}}, + {"id":"no-top","name":"NoTop","context_length":128000, + "top_provider":{"context_length":0}} + ]}` + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(body)) + })) + defer srv.Close() + + models, err := DiscoverOpenRouter(context.Background(), srv.URL) + if err != nil { + t.Fatal(err) + } + byID := map[string]Model{} + for _, m := range models { + byID[m.ID] = m + } + + // Inflated model-level window: use the smaller served limit. + if got := byID["infl"].ContextWindow; got != 262144 { + t.Errorf("inflated model ContextWindow = %d; want 262144 (served limit)", got) + } + // Served limit larger than model-level: keep the smaller model value. + if got := byID["served-bigger"].ContextWindow; got != 131072 { + t.Errorf("served-bigger ContextWindow = %d; want 131072 (model value, the smaller one)", got) + } + // No served limit: fall back to the model-level value. + if got := byID["no-top"].ContextWindow; got != 128000 { + t.Errorf("no-top ContextWindow = %d; want 128000 (model value, no top_provider)", got) + } +} diff --git a/packages/provider/openai.go b/packages/provider/openai.go index 9e69486..22c0d6e 100644 --- a/packages/provider/openai.go +++ b/packages/provider/openai.go @@ -183,6 +183,43 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) { if maxTok <= 0 { maxTok = m.MaxOutput } + // Clamp max_tokens so output plus a minimum input reservation fits + // within the context window. Some providers (OpenRouter) enforce + // input + max_output <= context_length and reject requests where the + // total exceeds it. Reserving headroom guarantees the system prompt, + // first message, and tool definitions have room. + // + // The cap is derived from the context window, never from MaxOutput: + // MaxOutput is already the output ceiling, so subtracting from it + // would shrink every model's budget even when its output comfortably + // fits the window. We only lower maxTok when ContextWindow - reserve + // is actually tighter than the requested budget, which is exactly the + // pathological case (e.g. a model whose MaxOutput equals its window). + // + // The reserve is proportional (window/8, capped at 4096) rather than a + // flat 4096 so small-window models aren't over-penalized: a flat 4096 + // would halve gpt-4's 8192 budget, while window/8 reserves a sensible + // 1024 there and still tops out at 4096 for large contexts. + // + // Some providers (OpenRouter) report inflated model-level context + // windows (e.g. 1000000) while the serving provider enforces a much + // tighter limit (e.g. 262144). Discovery already prefers the serving + // provider's smaller context_length, so m.ContextWindow is the real + // limit by the time we get here. + if m.ContextWindow > 0 { + reserve := m.ContextWindow / 8 + const maxReserve = 4096 + if reserve > maxReserve { + reserve = maxReserve + } + clamped := m.ContextWindow - reserve + if clamped < 1 { + clamped = 1 + } + if maxTok > clamped { + maxTok = clamped + } + } if m.Reasoning { if maxTok > 0 { out.MaxCompletionTok = &maxTok diff --git a/packages/provider/openai_clamp_test.go b/packages/provider/openai_clamp_test.go new file mode 100644 index 0000000..429ba81 --- /dev/null +++ b/packages/provider/openai_clamp_test.go @@ -0,0 +1,169 @@ +package provider + +import "testing" + +// withLiveModels installs a synthetic catalog overlay for the duration +// of a test and restores the previous state afterwards. It lets these +// tests pin exact ContextWindow / MaxOutput values without depending on +// the real catalog. +func withLiveModels(t *testing.T, models []Model) { + t.Helper() + activeMu.Lock() + prevActive := active + prevSet := activeSet + activeMu.Unlock() + + SetLiveModels(models) + + t.Cleanup(func() { + activeMu.Lock() + active = prevActive + activeSet = prevSet + activeMu.Unlock() + }) +} + +// outputBudget pulls whichever max-output field buildRequest populated +// for a non-reasoning model (out.MaxTokens) so tests can assert the +// clamped value regardless of the reasoning path. +func outputBudget(t *testing.T, out *oaiRequest) int { + t.Helper() + switch { + case out.MaxTokens != nil: + return *out.MaxTokens + case out.MaxCompletionTok != nil: + return *out.MaxCompletionTok + default: + t.Fatalf("no output budget set on request") + return 0 + } +} + +// TestBuildRequestDoesNotClampWhenOutputFitsWindow is the regression +// guard for the original PR #24 behavior: a normal model whose MaxOutput +// comfortably fits its context window must NOT have its budget reduced. +// The earlier implementation subtracted 4096 from min(window, MaxOutput), +// which silently shrank every OpenAI/DeepSeek model by 4096 tokens. +func TestBuildRequestDoesNotClampWhenOutputFitsWindow(t *testing.T) { + withLiveModels(t, []Model{{ + Provider: "openai", + ID: "fits-fine", + ContextWindow: 128000, + MaxOutput: 16384, + }}) + c := &openaiClient{name: "openai"} + + out, err := c.buildRequest(Request{ + Model: "fits-fine", + Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}}, + }) + if err != nil { + t.Fatal(err) + } + if got := outputBudget(t, out); got != 16384 { + t.Fatalf("output budget = %d; want 16384 (must not clamp when output fits window)", got) + } +} + +// TestBuildRequestClampsLargeWindowAtMaxReserve covers the bug PR #24 +// targets for a large-window model whose MaxOutput equals its (served) +// context window. Sending max_tokens == window leaves no room for input, +// so OpenRouter rejects it. The proportional reserve (window/8) is capped +// at 4096, so a 262144 window reserves the cap, not window/8. +func TestBuildRequestClampsLargeWindowAtMaxReserve(t *testing.T) { + const window = 262144 + withLiveModels(t, []Model{{ + Provider: "openrouter", + ID: "nemotron-tight", + ContextWindow: window, + MaxOutput: window, + }}) + c := &openaiClient{name: "openrouter"} + + out, err := c.buildRequest(Request{ + Model: "nemotron-tight", + Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}}, + }) + if err != nil { + t.Fatal(err) + } + want := window - 4096 // window/8 = 32768 > 4096 cap, so reserve = 4096 + if got := outputBudget(t, out); got != want { + t.Fatalf("output budget = %d; want %d (window - capped reserve)", got, want) + } +} + +// TestBuildRequestProportionalReserveSmallWindow verifies the reserve is +// proportional for small windows so they aren't over-penalized. gpt-4's +// 8192/8192 model reserves window/8 = 1024 (not a flat 4096, which would +// halve its budget), leaving 7168. +func TestBuildRequestProportionalReserveSmallWindow(t *testing.T) { + const window = 8192 + withLiveModels(t, []Model{{ + Provider: "openai", + ID: "gpt-4-like", + ContextWindow: window, + MaxOutput: window, + }}) + c := &openaiClient{name: "openai"} + + out, err := c.buildRequest(Request{ + Model: "gpt-4-like", + Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}}, + }) + if err != nil { + t.Fatal(err) + } + want := window - window/8 // 8192 - 1024 = 7168 + if got := outputBudget(t, out); got != want { + t.Fatalf("output budget = %d; want %d (window - window/8)", got, want) + } +} + +// TestBuildRequestClampFloor verifies a tiny context window never yields +// a zero or negative budget. window/8 of 16 is 2, leaving 14. +func TestBuildRequestClampFloor(t *testing.T) { + withLiveModels(t, []Model{{ + Provider: "openrouter", + ID: "tiny-window", + ContextWindow: 16, + MaxOutput: 16, + }}) + c := &openaiClient{name: "openrouter"} + + out, err := c.buildRequest(Request{ + Model: "tiny-window", + Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}}, + }) + if err != nil { + t.Fatal(err) + } + if got := outputBudget(t, out); got != 14 { + t.Fatalf("output budget = %d; want 14 (window - window/8, still positive)", got) + } +} + +// TestBuildRequestClampDoesNotInflate confirms the clamp only ever +// lowers the budget: a small explicit MaxTokens under the clamp ceiling +// passes through untouched. +func TestBuildRequestClampDoesNotInflate(t *testing.T) { + withLiveModels(t, []Model{{ + Provider: "openrouter", + ID: "roomy", + ContextWindow: 262144, + MaxOutput: 262144, + }}) + c := &openaiClient{name: "openrouter"} + + out, err := c.buildRequest(Request{ + Model: "roomy", + MaxTokens: 8000, + Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}}, + }) + if err != nil { + t.Fatal(err) + } + if got := outputBudget(t, out); got != 8000 { + t.Fatalf("output budget = %d; want 8000 (explicit request below ceiling, unchanged)", got) + } +}