From 3cf22fc32b90e9d85167b7bad2b5268b9e9f443f Mon Sep 17 00:00:00 2001
From: Raymond Gasper <raymondgasper@fastmail.com>
Date: Tue, 9 Jun 2026 12:24:04 -0400
Subject: [PATCH 1/2] fix: request model's full output-token budget per turn

Turns omitted MaxTokens on the provider request, so the Bedrock provider
fell back to its conservative 4096 default and silently truncated long
writes/edits with stopReason=length. In the TUI this read as if the
interaction had timed out.

Thread the resolved model's catalog MaxOutput through to the request:
  catalog Model.MaxOutput -> Resolved.MaxOutput -> Agent.MaxTokens
  -> provider.Request.MaxTokens
Zero still falls back to each provider's own default, so models without a
catalog MaxOutput are unaffected. The SDK path inherits this via NewAgent.

Also surface StopLength explicitly in the TUI as a status error
("response hit the model's output-token limit and was cut off -- ask it
to continue") instead of ending the turn silently.

Tests: TestAgentPropagatesMaxTokens (Agent.MaxTokens reaches the wire) and
TestBedrockBuildRequestMaxTokens (non-zero flows through; zero -> 4096).
---
 packages/agent/build.go                  |  9 ++++++
 packages/agent/modes/interactive.go      | 11 ++++++++
 packages/core/agent.go                   |  9 ++++++
 packages/core/agent_retry_test.go        | 35 +++++++++++++++++++++++
 packages/provider/amazon_bedrock_test.go | 36 ++++++++++++++++++++++++
 5 files changed, 100 insertions(+)

diff --git a/packages/agent/build.go b/packages/agent/build.go
index 4d23a5e..76aac8f 100644
--- a/packages/agent/build.go
+++ b/packages/agent/build.go
@@ -32,6 +32,13 @@ type Resolved struct {
 	MaxSteps     int
 	Sandbox      *tools.Sandbox
 
+	// MaxOutput is the resolved model's maximum output-token budget
+	// (from the catalog). Passed to the agent so each turn requests
+	// the model's full output capacity instead of the provider's
+	// conservative default (e.g. Bedrock's 4096, which truncates
+	// long writes/edits with stopReason=length).
+	MaxOutput int
+
 	// SkillTool is the on-demand skill loader registered with the
 	// agent's tool registry, or nil if no SKILL.md files were
 	// discovered. Exposed so the tui can list / preview skills.
@@ -501,6 +508,7 @@ func Resolve(args Args, requireCred bool) (Resolved, error) {
 		ToolSummary:      summaries,
 		SystemPrompt:     sys,
 		MaxSteps:         max,
+		MaxOutput:        resolvedModel.MaxOutput,
 		Sandbox:          sandbox,
 		SkillTool:        skillTool,
 		systemAppend:     append_,
@@ -768,6 +776,7 @@ func (r *Resolved) UseSandbox(s *tools.Sandbox) {
 func (r Resolved) NewAgent() *core.Agent {
 	a := core.NewAgent(r.NewClient(), r.Model, r.SystemPrompt, r.ToolRegistry)
 	a.MaxSteps = r.MaxSteps
+	a.MaxTokens = r.MaxOutput
 	a.Reasoning = r.Reasoning
 	return a
 }
diff --git a/packages/agent/modes/interactive.go b/packages/agent/modes/interactive.go
index cee5117..caa8115 100644
--- a/packages/agent/modes/interactive.go
+++ b/packages/agent/modes/interactive.go
@@ -4656,6 +4656,17 @@ func (i *Interactive) handleEvent(ev core.AgentEvent) {
 			i.statusOK = "cancelled"
 			return
 		}
+		if e.Stop == provider.StopLength {
+			// The model hit its output-token cap mid-response, so the
+			// reply (often a long write/edit) is truncated. Surface it
+			// explicitly — otherwise the turn just ends and reads like
+			// the UI gave up. The agent already requests the model's
+			// full MaxOutput budget, so this means the response genuinely
+			// exceeded that ceiling; ask the user to continue.
+			i.statusErr = "response hit the model's output-token limit and was cut off — ask it to continue"
+			i.statusOK = ""
+			return
+		}
 		// Don't surface mid-loop stream errors as a red banner here.
 		// EvTurnEnd fires after every step in a multi-step tool loop,
 		// so a transient 503 / network blip would briefly paint a red
diff --git a/packages/core/agent.go b/packages/core/agent.go
index 85ad854..a0ad5a0 100644
--- a/packages/core/agent.go
+++ b/packages/core/agent.go
@@ -22,6 +22,14 @@ type Agent struct {
 	MaxSteps  int
 	Reasoning string
 
+	// MaxTokens caps the model's output tokens per turn. Zero leaves
+	// the field unset on the provider request, letting each provider
+	// apply its own default (which can be conservative — Bedrock
+	// defaults to 4096, truncating long writes/edits). Hosts populate
+	// this from the resolved model's MaxOutput so large single-turn
+	// responses aren't silently cut off with stopReason=length.
+	MaxTokens int
+
 	// BeforeToolExecute, if set, is called immediately before each
 	// tool runs. Returning (allowed=false, reason) short-circuits
 	// the call with an error result containing reason. Optionally,
@@ -515,6 +523,7 @@ func (a *Agent) oneTurn(ctx context.Context, sink func(AgentEvent)) (provider.St
 		Messages:  repairToolUseResultPairs(a.Messages()),
 		Tools:     a.Tools.Specs(),
 		Reasoning: a.Reasoning,
+		MaxTokens: a.MaxTokens,
 	}
 	stream, err := a.Client.Stream(ctx, req)
 	if err != nil {
diff --git a/packages/core/agent_retry_test.go b/packages/core/agent_retry_test.go
index 457c24d..d234817 100644
--- a/packages/core/agent_retry_test.go
+++ b/packages/core/agent_retry_test.go
@@ -109,3 +109,38 @@ func TestAgentDropsPartialAssistantBeforeRetry(t *testing.T) {
 		t.Fatalf("final assistant text = %q; want recovered", got)
 	}
 }
+
+// captureClient records the last Request it received so tests can
+// assert what the agent put on the wire.
+type captureClient struct {
+	lastReq provider.Request
+}
+
+func (c *captureClient) Name() string { return "capture" }
+
+func (c *captureClient) Stream(ctx context.Context, req provider.Request) (<-chan provider.Event, error) {
+	c.lastReq = req
+	out := make(chan provider.Event, 3)
+	go func() {
+		defer close(out)
+		out <- provider.EventStart{Provider: "capture", Model: req.Model}
+		out <- provider.EventDone{Stop: provider.StopEnd, Message: provider.Message{
+			Role:    provider.RoleAssistant,
+			Content: []provider.Content{provider.TextBlock{Text: "ok"}},
+		}}
+	}()
+	return out, nil
+}
+
+func TestAgentPropagatesMaxTokens(t *testing.T) {
+	client := &captureClient{}
+	a := NewAgent(client, "fake-model", "system", Registry{})
+	a.MaxTokens = 64000
+
+	if err := a.Prompt(context.Background(), "hello", nil, nil); err != nil {
+		t.Fatalf("Prompt returned %v", err)
+	}
+	if client.lastReq.MaxTokens != 64000 {
+		t.Fatalf("request MaxTokens = %d; want 64000 (Agent.MaxTokens not propagated)", client.lastReq.MaxTokens)
+	}
+}
diff --git a/packages/provider/amazon_bedrock_test.go b/packages/provider/amazon_bedrock_test.go
index aa13c9a..5b00bb9 100644
--- a/packages/provider/amazon_bedrock_test.go
+++ b/packages/provider/amazon_bedrock_test.go
@@ -163,6 +163,42 @@ func TestBedrockModelSupportsCaching(t *testing.T) {
 	}
 }
 
+func TestBedrockBuildRequestMaxTokens(t *testing.T) {
+	client := &bedrockClient{region: "us-east-1"}
+
+	// A non-zero MaxTokens must flow through to InferenceConfig so the
+	// model gets its full output budget. This is the regression guard
+	// for long writes/edits being truncated at Bedrock's 4096 default.
+	req, err := client.buildRequest(Request{
+		Model:     "anthropic.claude-sonnet-4-5-20250929-v1:0",
+		MaxTokens: 64000,
+		Messages: []Message{
+			{Role: RoleUser, Content: []Content{TextBlock{Text: "hello"}}},
+		},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if req.InferenceConfig.MaxTokens != 64000 {
+		t.Errorf("MaxTokens = %d, want 64000", req.InferenceConfig.MaxTokens)
+	}
+
+	// Zero still falls back to the conservative provider default so an
+	// unset budget never sends maxTokens:0 (which Bedrock rejects).
+	reqZero, err := client.buildRequest(Request{
+		Model: "anthropic.claude-sonnet-4-5-20250929-v1:0",
+		Messages: []Message{
+			{Role: RoleUser, Content: []Content{TextBlock{Text: "hello"}}},
+		},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if reqZero.InferenceConfig.MaxTokens != 4096 {
+		t.Errorf("zero MaxTokens default = %d, want 4096", reqZero.InferenceConfig.MaxTokens)
+	}
+}
+
 func TestBedrockBuildRequestCachingClaudeModel(t *testing.T) {
 	// A Claude model (PriceCacheWrite > 0) should get cachePoint markers
 	// in the system array and on the last user message.

From a373e828965db386a7ac674bccd3bb230796ec0d Mon Sep 17 00:00:00 2001
From: patriceckhart <mail@patriceckhart.com>
Date: Tue, 9 Jun 2026 18:38:09 +0200
Subject: [PATCH 2/2] style: drop em-dashes from output-token-budget
 strings/comments

Co-authored-by: Raymond Gasper <raymondgasper@fastmail.com>
---
 packages/agent/modes/interactive.go | 4 ++--
 packages/core/agent.go              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/agent/modes/interactive.go b/packages/agent/modes/interactive.go
index caa8115..cc26cd0 100644
--- a/packages/agent/modes/interactive.go
+++ b/packages/agent/modes/interactive.go
@@ -4659,11 +4659,11 @@ func (i *Interactive) handleEvent(ev core.AgentEvent) {
 		if e.Stop == provider.StopLength {
 			// The model hit its output-token cap mid-response, so the
 			// reply (often a long write/edit) is truncated. Surface it
-			// explicitly — otherwise the turn just ends and reads like
+			// explicitly, otherwise the turn just ends and reads like
 			// the UI gave up. The agent already requests the model's
 			// full MaxOutput budget, so this means the response genuinely
 			// exceeded that ceiling; ask the user to continue.
-			i.statusErr = "response hit the model's output-token limit and was cut off — ask it to continue"
+			i.statusErr = "response hit the model's output-token limit and was cut off, ask it to continue"
 			i.statusOK = ""
 			return
 		}
diff --git a/packages/core/agent.go b/packages/core/agent.go
index a0ad5a0..cf8731c 100644
--- a/packages/core/agent.go
+++ b/packages/core/agent.go
@@ -24,7 +24,7 @@ type Agent struct {
 
 	// MaxTokens caps the model's output tokens per turn. Zero leaves
 	// the field unset on the provider request, letting each provider
-	// apply its own default (which can be conservative — Bedrock
+	// apply its own default (which can be conservative, e.g. Bedrock
 	// defaults to 4096, truncating long writes/edits). Hosts populate
 	// this from the resolved model's MaxOutput so large single-turn
 	// responses aren't silently cut off with stopReason=length.