mirror of
https://github.com/patriceckhart/zot.git
synced 2026-06-27 05:46:34 +02:00
Merge #24: clamp max_tokens to fit context window (proportional reserve)
Fixes OpenRouter rejecting requests where max_tokens equals the served context window. Prefer top_provider.context_length on discovery and clamp max_tokens to ContextWindow minus a proportional reserve (window/8 capped at 4096). Reworked from the original PR so the reserve derives from the window, not MaxOutput: models whose output already fits are untouched and small-window models are not over-penalized. Co-authored-by: Neil-urk12 <neil-urk12@users.noreply.github.com>
This commit is contained in:
commit
ffca64d4fd
4 changed files with 260 additions and 1 deletions
|
|
@ -284,7 +284,7 @@ func DiscoverOpenRouter(ctx context.Context, baseURL string) ([]Model, error) {
|
|||
display = d.ID
|
||||
}
|
||||
ctxWin := d.ContextLength
|
||||
if ctxWin == 0 {
|
||||
if d.TopProvider.ContextLength > 0 && (ctxWin == 0 || d.TopProvider.ContextLength < ctxWin) {
|
||||
ctxWin = d.TopProvider.ContextLength
|
||||
}
|
||||
maxOut := 0
|
||||
|
|
|
|||
53
packages/provider/discover_openrouter_test.go
Normal file
53
packages/provider/discover_openrouter_test.go
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
package provider
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestDiscoverOpenRouterPrefersServedContextLength verifies the
|
||||
// discovery fix: OpenRouter reports an inflated model-level
|
||||
// context_length (the model's theoretical max) alongside the serving
|
||||
// provider's real top_provider.context_length. When the served limit is
|
||||
// smaller, discovery must use it, because that's the limit OpenRouter
|
||||
// actually enforces. Using the inflated value made every context-window
|
||||
// check useless and let max_tokens exceed the real ceiling.
|
||||
func TestDiscoverOpenRouterPrefersServedContextLength(t *testing.T) {
|
||||
const body = `{"data":[
|
||||
{"id":"infl","name":"Inflated","context_length":1000000,
|
||||
"top_provider":{"context_length":262144}},
|
||||
{"id":"served-bigger","name":"ServedBigger","context_length":131072,
|
||||
"top_provider":{"context_length":262144}},
|
||||
{"id":"no-top","name":"NoTop","context_length":128000,
|
||||
"top_provider":{"context_length":0}}
|
||||
]}`
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(body))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
models, err := DiscoverOpenRouter(context.Background(), srv.URL)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
byID := map[string]Model{}
|
||||
for _, m := range models {
|
||||
byID[m.ID] = m
|
||||
}
|
||||
|
||||
// Inflated model-level window: use the smaller served limit.
|
||||
if got := byID["infl"].ContextWindow; got != 262144 {
|
||||
t.Errorf("inflated model ContextWindow = %d; want 262144 (served limit)", got)
|
||||
}
|
||||
// Served limit larger than model-level: keep the smaller model value.
|
||||
if got := byID["served-bigger"].ContextWindow; got != 131072 {
|
||||
t.Errorf("served-bigger ContextWindow = %d; want 131072 (model value, the smaller one)", got)
|
||||
}
|
||||
// No served limit: fall back to the model-level value.
|
||||
if got := byID["no-top"].ContextWindow; got != 128000 {
|
||||
t.Errorf("no-top ContextWindow = %d; want 128000 (model value, no top_provider)", got)
|
||||
}
|
||||
}
|
||||
|
|
@ -183,6 +183,43 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
|
|||
if maxTok <= 0 {
|
||||
maxTok = m.MaxOutput
|
||||
}
|
||||
// Clamp max_tokens so output plus a minimum input reservation fits
|
||||
// within the context window. Some providers (OpenRouter) enforce
|
||||
// input + max_output <= context_length and reject requests where the
|
||||
// total exceeds it. Reserving headroom guarantees the system prompt,
|
||||
// first message, and tool definitions have room.
|
||||
//
|
||||
// The cap is derived from the context window, never from MaxOutput:
|
||||
// MaxOutput is already the output ceiling, so subtracting from it
|
||||
// would shrink every model's budget even when its output comfortably
|
||||
// fits the window. We only lower maxTok when ContextWindow - reserve
|
||||
// is actually tighter than the requested budget, which is exactly the
|
||||
// pathological case (e.g. a model whose MaxOutput equals its window).
|
||||
//
|
||||
// The reserve is proportional (window/8, capped at 4096) rather than a
|
||||
// flat 4096 so small-window models aren't over-penalized: a flat 4096
|
||||
// would halve gpt-4's 8192 budget, while window/8 reserves a sensible
|
||||
// 1024 there and still tops out at 4096 for large contexts.
|
||||
//
|
||||
// Some providers (OpenRouter) report inflated model-level context
|
||||
// windows (e.g. 1000000) while the serving provider enforces a much
|
||||
// tighter limit (e.g. 262144). Discovery already prefers the serving
|
||||
// provider's smaller context_length, so m.ContextWindow is the real
|
||||
// limit by the time we get here.
|
||||
if m.ContextWindow > 0 {
|
||||
reserve := m.ContextWindow / 8
|
||||
const maxReserve = 4096
|
||||
if reserve > maxReserve {
|
||||
reserve = maxReserve
|
||||
}
|
||||
clamped := m.ContextWindow - reserve
|
||||
if clamped < 1 {
|
||||
clamped = 1
|
||||
}
|
||||
if maxTok > clamped {
|
||||
maxTok = clamped
|
||||
}
|
||||
}
|
||||
if m.Reasoning {
|
||||
if maxTok > 0 {
|
||||
out.MaxCompletionTok = &maxTok
|
||||
|
|
|
|||
169
packages/provider/openai_clamp_test.go
Normal file
169
packages/provider/openai_clamp_test.go
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
package provider
|
||||
|
||||
import "testing"
|
||||
|
||||
// withLiveModels installs a synthetic catalog overlay for the duration
|
||||
// of a test and restores the previous state afterwards. It lets these
|
||||
// tests pin exact ContextWindow / MaxOutput values without depending on
|
||||
// the real catalog.
|
||||
func withLiveModels(t *testing.T, models []Model) {
|
||||
t.Helper()
|
||||
activeMu.Lock()
|
||||
prevActive := active
|
||||
prevSet := activeSet
|
||||
activeMu.Unlock()
|
||||
|
||||
SetLiveModels(models)
|
||||
|
||||
t.Cleanup(func() {
|
||||
activeMu.Lock()
|
||||
active = prevActive
|
||||
activeSet = prevSet
|
||||
activeMu.Unlock()
|
||||
})
|
||||
}
|
||||
|
||||
// outputBudget pulls whichever max-output field buildRequest populated
|
||||
// for a non-reasoning model (out.MaxTokens) so tests can assert the
|
||||
// clamped value regardless of the reasoning path.
|
||||
func outputBudget(t *testing.T, out *oaiRequest) int {
|
||||
t.Helper()
|
||||
switch {
|
||||
case out.MaxTokens != nil:
|
||||
return *out.MaxTokens
|
||||
case out.MaxCompletionTok != nil:
|
||||
return *out.MaxCompletionTok
|
||||
default:
|
||||
t.Fatalf("no output budget set on request")
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildRequestDoesNotClampWhenOutputFitsWindow is the regression
|
||||
// guard for the original PR #24 behavior: a normal model whose MaxOutput
|
||||
// comfortably fits its context window must NOT have its budget reduced.
|
||||
// The earlier implementation subtracted 4096 from min(window, MaxOutput),
|
||||
// which silently shrank every OpenAI/DeepSeek model by 4096 tokens.
|
||||
func TestBuildRequestDoesNotClampWhenOutputFitsWindow(t *testing.T) {
|
||||
withLiveModels(t, []Model{{
|
||||
Provider: "openai",
|
||||
ID: "fits-fine",
|
||||
ContextWindow: 128000,
|
||||
MaxOutput: 16384,
|
||||
}})
|
||||
c := &openaiClient{name: "openai"}
|
||||
|
||||
out, err := c.buildRequest(Request{
|
||||
Model: "fits-fine",
|
||||
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got := outputBudget(t, out); got != 16384 {
|
||||
t.Fatalf("output budget = %d; want 16384 (must not clamp when output fits window)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildRequestClampsLargeWindowAtMaxReserve covers the bug PR #24
|
||||
// targets for a large-window model whose MaxOutput equals its (served)
|
||||
// context window. Sending max_tokens == window leaves no room for input,
|
||||
// so OpenRouter rejects it. The proportional reserve (window/8) is capped
|
||||
// at 4096, so a 262144 window reserves the cap, not window/8.
|
||||
func TestBuildRequestClampsLargeWindowAtMaxReserve(t *testing.T) {
|
||||
const window = 262144
|
||||
withLiveModels(t, []Model{{
|
||||
Provider: "openrouter",
|
||||
ID: "nemotron-tight",
|
||||
ContextWindow: window,
|
||||
MaxOutput: window,
|
||||
}})
|
||||
c := &openaiClient{name: "openrouter"}
|
||||
|
||||
out, err := c.buildRequest(Request{
|
||||
Model: "nemotron-tight",
|
||||
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
want := window - 4096 // window/8 = 32768 > 4096 cap, so reserve = 4096
|
||||
if got := outputBudget(t, out); got != want {
|
||||
t.Fatalf("output budget = %d; want %d (window - capped reserve)", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildRequestProportionalReserveSmallWindow verifies the reserve is
|
||||
// proportional for small windows so they aren't over-penalized. gpt-4's
|
||||
// 8192/8192 model reserves window/8 = 1024 (not a flat 4096, which would
|
||||
// halve its budget), leaving 7168.
|
||||
func TestBuildRequestProportionalReserveSmallWindow(t *testing.T) {
|
||||
const window = 8192
|
||||
withLiveModels(t, []Model{{
|
||||
Provider: "openai",
|
||||
ID: "gpt-4-like",
|
||||
ContextWindow: window,
|
||||
MaxOutput: window,
|
||||
}})
|
||||
c := &openaiClient{name: "openai"}
|
||||
|
||||
out, err := c.buildRequest(Request{
|
||||
Model: "gpt-4-like",
|
||||
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
want := window - window/8 // 8192 - 1024 = 7168
|
||||
if got := outputBudget(t, out); got != want {
|
||||
t.Fatalf("output budget = %d; want %d (window - window/8)", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildRequestClampFloor verifies a tiny context window never yields
|
||||
// a zero or negative budget. window/8 of 16 is 2, leaving 14.
|
||||
func TestBuildRequestClampFloor(t *testing.T) {
|
||||
withLiveModels(t, []Model{{
|
||||
Provider: "openrouter",
|
||||
ID: "tiny-window",
|
||||
ContextWindow: 16,
|
||||
MaxOutput: 16,
|
||||
}})
|
||||
c := &openaiClient{name: "openrouter"}
|
||||
|
||||
out, err := c.buildRequest(Request{
|
||||
Model: "tiny-window",
|
||||
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got := outputBudget(t, out); got != 14 {
|
||||
t.Fatalf("output budget = %d; want 14 (window - window/8, still positive)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildRequestClampDoesNotInflate confirms the clamp only ever
|
||||
// lowers the budget: a small explicit MaxTokens under the clamp ceiling
|
||||
// passes through untouched.
|
||||
func TestBuildRequestClampDoesNotInflate(t *testing.T) {
|
||||
withLiveModels(t, []Model{{
|
||||
Provider: "openrouter",
|
||||
ID: "roomy",
|
||||
ContextWindow: 262144,
|
||||
MaxOutput: 262144,
|
||||
}})
|
||||
c := &openaiClient{name: "openrouter"}
|
||||
|
||||
out, err := c.buildRequest(Request{
|
||||
Model: "roomy",
|
||||
MaxTokens: 8000,
|
||||
Messages: []Message{{Role: RoleUser, Content: []Content{TextBlock{Text: "hi"}}}},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got := outputBudget(t, out); got != 8000 {
|
||||
t.Fatalf("output budget = %d; want 8000 (explicit request below ceiling, unchanged)", got)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue