From 3cf22fc32b90e9d85167b7bad2b5268b9e9f443f Mon Sep 17 00:00:00 2001 From: Raymond Gasper Date: Tue, 9 Jun 2026 12:24:04 -0400 Subject: [PATCH 1/2] fix: request model's full output-token budget per turn Turns omitted MaxTokens on the provider request, so Bedrock applied its conservative 4096 default and silently truncated long writes/edits with stopReason=length. In the TUI this read like the interaction timed out. Thread the resolved model's catalog MaxOutput through to the request: catalog Model.MaxOutput -> Resolved.MaxOutput -> Agent.MaxTokens -> provider.Request.MaxTokens Zero still falls back to each provider's own default, so models without a catalog MaxOutput are unaffected. The SDK path inherits this via NewAgent. Also surface StopLength explicitly in the TUI ('response hit the output limit -- ask it to continue') instead of ending silently. Tests: TestAgentPropagatesMaxTokens (Agent.MaxTokens reaches the wire) and TestBedrockBuildRequestMaxTokens (non-zero flows through; zero -> 4096). --- packages/agent/build.go | 9 ++++++ packages/agent/modes/interactive.go | 11 ++++++++ packages/core/agent.go | 9 ++++++ packages/core/agent_retry_test.go | 35 +++++++++++++++++++++++ packages/provider/amazon_bedrock_test.go | 36 ++++++++++++++++++++++++ 5 files changed, 100 insertions(+) diff --git a/packages/agent/build.go b/packages/agent/build.go index 4d23a5e..76aac8f 100644 --- a/packages/agent/build.go +++ b/packages/agent/build.go @@ -32,6 +32,13 @@ type Resolved struct { MaxSteps int Sandbox *tools.Sandbox + // MaxOutput is the resolved model's maximum output-token budget + // (from the catalog). Passed to the agent so each turn requests + // the model's full output capacity instead of the provider's + // conservative default (e.g. Bedrock's 4096, which truncates + // long writes/edits with stopReason=length). 
+ MaxOutput int + // SkillTool is the on-demand skill loader registered with the // agent's tool registry, or nil if no SKILL.md files were // discovered. Exposed so the tui can list / preview skills. @@ -501,6 +508,7 @@ func Resolve(args Args, requireCred bool) (Resolved, error) { ToolSummary: summaries, SystemPrompt: sys, MaxSteps: max, + MaxOutput: resolvedModel.MaxOutput, Sandbox: sandbox, SkillTool: skillTool, systemAppend: append_, @@ -768,6 +776,7 @@ func (r *Resolved) UseSandbox(s *tools.Sandbox) { func (r Resolved) NewAgent() *core.Agent { a := core.NewAgent(r.NewClient(), r.Model, r.SystemPrompt, r.ToolRegistry) a.MaxSteps = r.MaxSteps + a.MaxTokens = r.MaxOutput a.Reasoning = r.Reasoning return a } diff --git a/packages/agent/modes/interactive.go b/packages/agent/modes/interactive.go index cee5117..caa8115 100644 --- a/packages/agent/modes/interactive.go +++ b/packages/agent/modes/interactive.go @@ -4656,6 +4656,17 @@ func (i *Interactive) handleEvent(ev core.AgentEvent) { i.statusOK = "cancelled" return } + if e.Stop == provider.StopLength { + // The model hit its output-token cap mid-response, so the + // reply (often a long write/edit) is truncated. Surface it + // explicitly — otherwise the turn just ends and reads like + // the UI gave up. The agent already requests the model's + // full MaxOutput budget, so this means the response genuinely + // exceeded that ceiling; ask the user to continue. + i.statusErr = "response hit the model's output-token limit and was cut off — ask it to continue" + i.statusOK = "" + return + } // Don't surface mid-loop stream errors as a red banner here. 
// EvTurnEnd fires after every step in a multi-step tool loop, // so a transient 503 / network blip would briefly paint a red diff --git a/packages/core/agent.go b/packages/core/agent.go index 85ad854..a0ad5a0 100644 --- a/packages/core/agent.go +++ b/packages/core/agent.go @@ -22,6 +22,14 @@ type Agent struct { MaxSteps int Reasoning string + // MaxTokens caps the model's output tokens per turn. Zero leaves + // the field unset on the provider request, letting each provider + // apply its own default (which can be conservative — Bedrock + // defaults to 4096, truncating long writes/edits). Hosts populate + // this from the resolved model's MaxOutput so large single-turn + // responses aren't silently cut off with stopReason=length. + MaxTokens int + // BeforeToolExecute, if set, is called immediately before each // tool runs. Returning (allowed=false, reason) short-circuits // the call with an error result containing reason. Optionally, @@ -515,6 +523,7 @@ func (a *Agent) oneTurn(ctx context.Context, sink func(AgentEvent)) (provider.St Messages: repairToolUseResultPairs(a.Messages()), Tools: a.Tools.Specs(), Reasoning: a.Reasoning, + MaxTokens: a.MaxTokens, } stream, err := a.Client.Stream(ctx, req) if err != nil { diff --git a/packages/core/agent_retry_test.go b/packages/core/agent_retry_test.go index 457c24d..d234817 100644 --- a/packages/core/agent_retry_test.go +++ b/packages/core/agent_retry_test.go @@ -109,3 +109,38 @@ func TestAgentDropsPartialAssistantBeforeRetry(t *testing.T) { t.Fatalf("final assistant text = %q; want recovered", got) } } + +// captureClient records the last Request it received so tests can +// assert what the agent put on the wire. 
+type captureClient struct { + lastReq provider.Request +} + +func (c *captureClient) Name() string { return "capture" } + +func (c *captureClient) Stream(ctx context.Context, req provider.Request) (<-chan provider.Event, error) { + c.lastReq = req + out := make(chan provider.Event, 3) + go func() { + defer close(out) + out <- provider.EventStart{Provider: "capture", Model: req.Model} + out <- provider.EventDone{Stop: provider.StopEnd, Message: provider.Message{ + Role: provider.RoleAssistant, + Content: []provider.Content{provider.TextBlock{Text: "ok"}}, + }} + }() + return out, nil +} + +func TestAgentPropagatesMaxTokens(t *testing.T) { + client := &captureClient{} + a := NewAgent(client, "fake-model", "system", Registry{}) + a.MaxTokens = 64000 + + if err := a.Prompt(context.Background(), "hello", nil, nil); err != nil { + t.Fatalf("Prompt returned %v", err) + } + if client.lastReq.MaxTokens != 64000 { + t.Fatalf("request MaxTokens = %d; want 64000 (Agent.MaxTokens not propagated)", client.lastReq.MaxTokens) + } +} diff --git a/packages/provider/amazon_bedrock_test.go b/packages/provider/amazon_bedrock_test.go index aa13c9a..5b00bb9 100644 --- a/packages/provider/amazon_bedrock_test.go +++ b/packages/provider/amazon_bedrock_test.go @@ -163,6 +163,42 @@ func TestBedrockModelSupportsCaching(t *testing.T) { } } +func TestBedrockBuildRequestMaxTokens(t *testing.T) { + client := &bedrockClient{region: "us-east-1"} + + // A non-zero MaxTokens must flow through to InferenceConfig so the + // model gets its full output budget. This is the regression guard + // for long writes/edits being truncated at Bedrock's 4096 default. 
+ req, err := client.buildRequest(Request{ + Model: "anthropic.claude-sonnet-4-5-20250929-v1:0", + MaxTokens: 64000, + Messages: []Message{ + {Role: RoleUser, Content: []Content{TextBlock{Text: "hello"}}}, + }, + }) + if err != nil { + t.Fatal(err) + } + if req.InferenceConfig.MaxTokens != 64000 { + t.Errorf("MaxTokens = %d, want 64000", req.InferenceConfig.MaxTokens) + } + + // Zero still falls back to the conservative provider default so an + // unset budget never sends maxTokens:0 (which Bedrock rejects). + reqZero, err := client.buildRequest(Request{ + Model: "anthropic.claude-sonnet-4-5-20250929-v1:0", + Messages: []Message{ + {Role: RoleUser, Content: []Content{TextBlock{Text: "hello"}}}, + }, + }) + if err != nil { + t.Fatal(err) + } + if reqZero.InferenceConfig.MaxTokens != 4096 { + t.Errorf("zero MaxTokens default = %d, want 4096", reqZero.InferenceConfig.MaxTokens) + } +} + func TestBedrockBuildRequestCachingClaudeModel(t *testing.T) { // A Claude model (PriceCacheWrite > 0) should get cachePoint markers // in the system array and on the last user message. From a373e828965db386a7ac674bccd3bb230796ec0d Mon Sep 17 00:00:00 2001 From: patriceckhart Date: Tue, 9 Jun 2026 18:38:09 +0200 Subject: [PATCH 2/2] style: drop em-dashes from output-token-budget strings/comments Co-authored-by: Raymond Gasper --- packages/agent/modes/interactive.go | 4 ++-- packages/core/agent.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/agent/modes/interactive.go b/packages/agent/modes/interactive.go index caa8115..cc26cd0 100644 --- a/packages/agent/modes/interactive.go +++ b/packages/agent/modes/interactive.go @@ -4659,11 +4659,11 @@ func (i *Interactive) handleEvent(ev core.AgentEvent) { if e.Stop == provider.StopLength { // The model hit its output-token cap mid-response, so the // reply (often a long write/edit) is truncated. 
Surface it - // explicitly — otherwise the turn just ends and reads like + // explicitly; otherwise the turn just ends and reads like // the UI gave up. The agent already requests the model's // full MaxOutput budget, so this means the response genuinely // exceeded that ceiling; ask the user to continue. - i.statusErr = "response hit the model's output-token limit and was cut off — ask it to continue" + i.statusErr = "response hit the model's output-token limit and was cut off, ask it to continue" i.statusOK = "" return } diff --git a/packages/core/agent.go b/packages/core/agent.go index a0ad5a0..cf8731c 100644 --- a/packages/core/agent.go +++ b/packages/core/agent.go @@ -24,7 +24,7 @@ type Agent struct { // MaxTokens caps the model's output tokens per turn. Zero leaves // the field unset on the provider request, letting each provider - // apply its own default (which can be conservative — Bedrock + // apply its own default (which can be conservative, e.g. Bedrock // defaults to 4096, truncating long writes/edits). Hosts populate // this from the resolved model's MaxOutput so large single-turn // responses aren't silently cut off with stopReason=length.