Merge #24: clamp max_tokens to fit context window (proportional reserve)

Fixes OpenRouter rejecting requests where max_tokens equals the served
context window. Prefer top_provider.context_length on discovery and clamp
max_tokens to ContextWindow minus a proportional reserve (window/8 capped
at 4096). Reworked from the original PR so the reserve derives from the
window, not MaxOutput: models whose output already fits are untouched and
small-window models are not over-penalized.

Co-authored-by: Neil-urk12 <neil-urk12@users.noreply.github.com>
This commit is contained in:
patriceckhart 2026-06-09 19:29:54 +02:00
commit ffca64d4fd
4 changed files with 260 additions and 1 deletions

View file

@ -284,7 +284,7 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) {
display = d.ID
}
ctxWin := d.ContextLength
if ctxWin == 0 {
if d.TopProvider.ContextLength > 0 && (ctxWin == 0 || d.TopProvider.ContextLength < ctxWin) {
ctxWin = d.TopProvider.ContextLength
}
maxOut := 0

View file

@ -0,0 +1,53 @@
package provider
import (
"context"
"net/http"
"net/http/httptest"
"testing"
)
// TestDiscoverOpenRouterPrefersServedContextLength verifies the
// discovery fix: OpenRouter reports an inflated model-level
// context_length (the model's theoretical max) alongside the serving
// provider's real top_provider.context_length. When the served limit is
// smaller, discovery must use it, because that's the limit OpenRouter
// actually enforces. Using the inflated value made every context-window
// check useless and let max_tokens exceed the real ceiling.
func TestDiscoverOpenRouterPrefersServedContextLength(t *testing.T) {
const body = `{"data":[
{"id":"infl","name":"Inflated","context_length":1000000,
"top_provider":{"context_length":262144}},
{"id":"served-bigger","name":"ServedBigger","context_length":131072,
"top_provider":{"context_length":262144}},
{"id":"no-top","name":"NoTop","context_length":128000,
"top_provider":{"context_length":0}}
]}`
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(body))
}))
defer srv.Close()
models, err := DiscoverOpenRouter(context.Background(), srv.URL)
if err != nil {
t.Fatal(err)
}
byID := map[string]Model{}
for _, m := range models {
byID[m.ID] = m
}
// Inflated model-level window: use the smaller served limit.
if got := byID["infl"].ContextWindow; got != 262144 {
t.Errorf("inflated model ContextWindow = %d; want 262144 (served limit)", got)
}
// Served limit larger than model-level: keep the smaller model value.
if got := byID["served-bigger"].ContextWindow; got != 131072 {
t.Errorf("served-bigger ContextWindow = %d; want 131072 (model value, the smaller one)", got)
}
// No served limit: fall back to the model-level value.
if got := byID["no-top"].ContextWindow; got != 128000 {
t.Errorf("no-top ContextWindow = %d; want 128000 (model value, no top_provider)", got)
}
}

View file

@ -183,6 +183,43 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
if maxTok <= 0 {
maxTok = m.MaxOutput
}
// Clamp max_tokens so output plus a minimum input reservation fits
// within the context window. Some providers (OpenRouter) enforce
// input + max_output <= context_length and reject requests where the
// total exceeds it. Reserving headroom guarantees the system prompt,
// first message, and tool definitions have room.
//
// The cap is derived from the context window, never from MaxOutput:
// MaxOutput is already the output ceiling, so subtracting from it
// would shrink every model's budget even when its output comfortably
// fits the window. We only lower maxTok when ContextWindow - reserve
// is actually tighter than the requested budget, which is exactly the
// pathological case (e.g. a model whose MaxOutput equals its window).
//
// The reserve is proportional (window/8, capped at 4096) rather than a
// flat 4096 so small-window models aren't over-penalized: a flat 4096
// would halve gpt-4's 8192 budget, while window/8 reserves a sensible
// 1024 there and still tops out at 4096 for large contexts.
//
// Some providers (OpenRouter) report inflated model-level context
// windows (e.g. 1000000) while the serving provider enforces a much
// tighter limit (e.g. 262144). Discovery already prefers the serving
// provider's smaller context_length, so m.ContextWindow is the real
// limit by the time we get here.
if m.ContextWindow > 0 {
reserve := m.ContextWindow / 8
const maxReserve = 4096
if reserve > maxReserve {
reserve = maxReserve
}
clamped := m.ContextWindow - reserve
if clamped < 1 {
clamped = 1
}
if maxTok > clamped {
maxTok = clamped
}
}
if m.Reasoning {
if maxTok > 0 {
out.MaxCompletionTok = &maxTok

View file

@ -0,0 +1,169 @@
package provider
import "testing"
// withLiveModels installs a synthetic catalog overlay for the duration
// of a test and restores the previous state afterwards. It lets these
// tests pin exact ContextWindow / MaxOutput values without depending on
// the real catalog.
func withLiveModels(t *testing.T, models []Model) {
t.Helper()
activeMu.Lock()
prevActive := active
prevSet := activeSet
activeMu.Unlock()
SetLiveModels(models)
t.Cleanup(func() {
activeMu.Lock()
active = prevActive
activeSet = prevSet
activeMu.Unlock()
})
}
// outputBudget pulls whichever max-output field buildRequest populated
// for a non-reasoning model (out.MaxTokens) so tests can assert the
// clamped value regardless of the reasoning path.
func outputBudget(t *testing.T, out *oaiRequest) int {
t.Helper()
switch {
case out.MaxTokens != nil:
return *out.MaxTokens
case out.MaxCompletionTok != nil:
return *out.MaxCompletionTok
default:
t.Fatalf("no output budget set on request")
return 0
}
}
// TestBuildRequestDoesNotClampWhenOutputFitsWindow is the regression
// guard for the original PR #24 behavior: a normal model whose MaxOutput
// comfortably fits its context window must NOT have its budget reduced.
// The earlier implementation subtracted 4096 from min(window, MaxOutput),
// which silently shrank every OpenAI/DeepSeek model by 4096 tokens.
func TestBuildRequestDoesNotClampWhenOutputFitsWindow(t *testing.T) {
withLiveModels(t, []Model{{
Provider: "openai",
ID: "fits-fine",
ContextWindow: 128000,
MaxOutput: 16384,
}})
c := &openaiClient{name: "openai"}
out, err := c.buildRequest(Request{
Model: "fits-fine",
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
})
if err != nil {
t.Fatal(err)
}
if got := outputBudget(t, out); got != 16384 {
t.Fatalf("output budget = %d; want 16384 (must not clamp when output fits window)", got)
}
}
// TestBuildRequestClampsLargeWindowAtMaxReserve covers the bug PR #24
// targets for a large-window model whose MaxOutput equals its (served)
// context window. Sending max_tokens == window leaves no room for input,
// so OpenRouter rejects it. The proportional reserve (window/8) is capped
// at 4096, so a 262144 window reserves the cap, not window/8.
func TestBuildRequestClampsLargeWindowAtMaxReserve(t *testing.T) {
const window = 262144
withLiveModels(t, []Model{{
Provider: "openrouter",
ID: "nemotron-tight",
ContextWindow: window,
MaxOutput: window,
}})
c := &openaiClient{name: "openrouter"}
out, err := c.buildRequest(Request{
Model: "nemotron-tight",
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
})
if err != nil {
t.Fatal(err)
}
want := window - 4096 // window/8 = 32768 > 4096 cap, so reserve = 4096
if got := outputBudget(t, out); got != want {
t.Fatalf("output budget = %d; want %d (window - capped reserve)", got, want)
}
}
// TestBuildRequestProportionalReserveSmallWindow verifies the reserve is
// proportional for small windows so they aren't over-penalized. gpt-4's
// 8192/8192 model reserves window/8 = 1024 (not a flat 4096, which would
// halve its budget), leaving 7168.
func TestBuildRequestProportionalReserveSmallWindow(t *testing.T) {
const window = 8192
withLiveModels(t, []Model{{
Provider: "openai",
ID: "gpt-4-like",
ContextWindow: window,
MaxOutput: window,
}})
c := &openaiClient{name: "openai"}
out, err := c.buildRequest(Request{
Model: "gpt-4-like",
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
})
if err != nil {
t.Fatal(err)
}
want := window - window/8 // 8192 - 1024 = 7168
if got := outputBudget(t, out); got != want {
t.Fatalf("output budget = %d; want %d (window - window/8)", got, want)
}
}
// TestBuildRequestClampFloor verifies a tiny context window never yields
// a zero or negative budget. window/8 of 16 is 2, leaving 14.
func TestBuildRequestClampFloor(t *testing.T) {
withLiveModels(t, []Model{{
Provider: "openrouter",
ID: "tiny-window",
ContextWindow: 16,
MaxOutput: 16,
}})
c := &openaiClient{name: "openrouter"}
out, err := c.buildRequest(Request{
Model: "tiny-window",
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
})
if err != nil {
t.Fatal(err)
}
if got := outputBudget(t, out); got != 14 {
t.Fatalf("output budget = %d; want 14 (window - window/8, still positive)", got)
}
}
// TestBuildRequestClampDoesNotInflate confirms the clamp only ever
// lowers the budget: a small explicit MaxTokens under the clamp ceiling
// passes through untouched.
func TestBuildRequestClampDoesNotInflate(t *testing.T) {
withLiveModels(t, []Model{{
Provider: "openrouter",
ID: "roomy",
ContextWindow: 262144,
MaxOutput: 262144,
}})
c := &openaiClient{name: "openrouter"}
out, err := c.buildRequest(Request{
Model: "roomy",
MaxTokens: 8000,
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
})
if err != nil {
t.Fatal(err)
}
if got := outputBudget(t, out); got != 8000 {
t.Fatalf("output budget = %d; want 8000 (explicit request below ceiling, unchanged)", got)
}
}