diff --git a/internal/agent/modes/interactive.go b/internal/agent/modes/interactive.go
index d4570ff..b960635 100644
--- a/internal/agent/modes/interactive.go
+++ b/internal/agent/modes/interactive.go
@@ -560,7 +560,7 @@ func (i *Interactive) redraw() {
 	cols, _ := i.cfg.Terminal.Size()
 
 	if i.agent != nil {
-		i.view.Messages = i.agent.Messages()
+		i.view.Messages = filterHiddenTranscriptMessages(i.agent.Messages())
 	} else {
 		i.view.Messages = nil
 	}
@@ -896,6 +896,45 @@ func alignSliceStartToImageBlock(chat []string, start, end int) int {
 	return start
 }
 
+// hiddenOpenAIImageMirrorPrefix is the first text block of the synthetic user
+// message that internal/core appends after a tool result containing images
+// (mirrorToolImagesAsUser in internal/core/agent.go). It must stay
+// byte-identical to the text used there, or mirrored messages become visible
+// in the rendered transcript.
+const hiddenOpenAIImageMirrorPrefix = "Tool output included the following image content:"
+
+// filterHiddenTranscriptMessages returns a copy of msgs without the synthetic
+// messages that exist only for the provider and must not be rendered. The
+// input slice is left untouched; empty input yields nil.
+func filterHiddenTranscriptMessages(msgs []provider.Message) []provider.Message {
+	if len(msgs) == 0 {
+		return nil
+	}
+	out := make([]provider.Message, 0, len(msgs))
+	for _, m := range msgs {
+		if isHiddenTranscriptMessage(m) {
+			continue
+		}
+		out = append(out, m)
+	}
+	return out
+}
+
+// isHiddenTranscriptMessage reports whether m is a mirrored-image message: a
+// user message whose first block is exactly the mirror prefix, followed by at
+// least one more block (the mirrored content). Requiring the extra block
+// keeps a real user message that merely consists of the prefix text visible.
+func isHiddenTranscriptMessage(m provider.Message) bool {
+	if m.Role != provider.RoleUser || len(m.Content) < 2 {
+		return false
+	}
+	tb, ok := m.Content[0].(provider.TextBlock)
+	if !ok {
+		return false
+	}
+	return strings.TrimSpace(tb.Text) == hiddenOpenAIImageMirrorPrefix
+}
+
 func clipBottomClippedImages(lines []string) []string {
 	if len(lines) == 0 {
 		return lines
diff --git a/internal/core/agent.go b/internal/core/agent.go
index 7c32c48..75d36f0 100644
--- a/internal/core/agent.go
+++ b/internal/core/agent.go
@@ -181,6 +181,18 @@ func (a *Agent) runLoop(ctx context.Context, sink func(AgentEvent)) error {
 		toolMsg, hadError := a.executeTools(ctx, assistantMsg, sink)
 		a.mu.Lock()
 		a.messages = append(a.messages, toolMsg)
+		// OpenAI's chat-completions tool message shape is text-centric.
+		// Vision models reliably consume images when they arrive as user
+		// content, so when a tool result contains images we mirror them
+		// into a synthetic user message immediately after the tool result.
+		// The tool result itself keeps its image blocks, so the stored
+		// transcript stays complete for providers that accept images in
+		// tool messages; the mirror only exists for OpenAI's benefit.
+		if a.Client != nil && a.Client.Name() == "openai" {
+			if mirror := mirrorToolImagesAsUser(toolMsg); len(mirror.Content) > 0 {
+				a.messages = append(a.messages, mirror)
+			}
+		}
 		a.mu.Unlock()
 		// If context was cancelled during tool execution, bail out.
 		if err := ctx.Err(); err != nil {
@@ -388,6 +400,50 @@ func (a *Agent) runOneTool(ctx context.Context, tc provider.ToolCallBlock, sink
-// extractText concatenates all TextBlock content in a message. Used
-// by BeforeAssistantMessage so guards see a single string instead of
-// having to walk provider.Content themselves.
+const (
+	// toolImageMirrorPrefix opens every mirrored image message. The
+	// interactive UI (internal/agent/modes) hides messages starting with
+	// this exact text, so both places must change together.
+	toolImageMirrorPrefix = "Tool output included the following image content:"
+
+	// maxMirroredTextLen is the largest tool-output text block, in bytes,
+	// that is copied into the mirror as context for the images.
+	maxMirroredTextLen = 500
+)
+
+// mirrorToolImagesAsUser builds a synthetic user message that carries the
+// images found in msg's tool results, preceded by toolImageMirrorPrefix and
+// any short text blocks from those results. It returns the zero Message when
+// msg holds no image, so text-only tool results are never mirrored.
+func mirrorToolImagesAsUser(msg provider.Message) provider.Message {
+	content := []provider.Content{provider.TextBlock{Text: toolImageMirrorPrefix}}
+	hasImage := false
+	for _, c := range msg.Content {
+		tr, ok := c.(provider.ToolResultBlock)
+		if !ok {
+			continue
+		}
+		for _, inner := range tr.Content {
+			switch v := inner.(type) {
+			case provider.TextBlock:
+				// Keep short textual context so the model understands why
+				// the images appeared, but don't duplicate giant read
+				// outputs verbatim.
+				if len(v.Text) > 0 && len(v.Text) <= maxMirroredTextLen {
+					content = append(content, v)
+				}
+			case provider.ImageBlock:
+				hasImage = true
+				content = append(content, v)
+			}
+		}
+	}
+	if !hasImage {
+		return provider.Message{}
+	}
+	return provider.Message{Role: provider.RoleUser, Content: content, Time: time.Now()}
+}
+
+// extractText concatenates all TextBlock content in a message. Used
+// by BeforeAssistantMessage so guards see a single string instead of
+// having to walk provider.Content themselves.
 func extractText(msg provider.Message) string {
 	var out string
 	for _, c := range msg.Content {
diff --git a/internal/provider/openai.go b/internal/provider/openai.go
index 2a5c04d..7938839 100644
--- a/internal/provider/openai.go
+++ b/internal/provider/openai.go
@@ -174,25 +174,16 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
 			}
 			out.Messages = append(out.Messages, am)
 		case RoleTool:
-			// Each ToolResultBlock becomes its own tool message.
+			// Each ToolResultBlock becomes its own tool message. Tool
+			// messages are text-only in the chat-completions API, so
+			// image blocks are replaced by a placeholder line here.
 			for _, b := range msg.Content {
 				if tr, ok := b.(ToolResultBlock); ok {
-					var text strings.Builder
-					for _, inner := range tr.Content {
-						if tb, ok := inner.(TextBlock); ok {
-							if text.Len() > 0 {
-								text.WriteString("\n")
-							}
-							text.WriteString(tb.Text)
-						}
-					}
-					if tr.IsError && text.Len() > 0 {
-						text.WriteString(" [error]")
-					}
+					content := buildOAIToolContent(tr.Content, tr.IsError)
 					out.Messages = append(out.Messages, oaiMessage{
 						Role:       "tool",
 						ToolCallID: tr.CallID,
-						Content:    text.String(),
+						Content:    content,
 					})
 				}
 			}
@@ -234,6 +225,46 @@ func buildOAIUserContent(blocks []Content) interface{} {
 		}
 		return sb.String()
 	}
+	return buildOAIContentBlocks(blocks)
+}
+
+// oaiToolImagePlaceholder replaces image blocks in tool messages.
+const oaiToolImagePlaceholder = "[image content omitted: tool messages are text-only]"
+
+// buildOAIToolContent flattens a tool result into the text content of a
+// role "tool" message. Text blocks are joined with newlines. Tool messages
+// in the chat-completions API accept text parts only, so image blocks are
+// not forwarded; one placeholder line records that they were present.
+// Failed calls get a trailing " [error]" marker when there is any text.
+func buildOAIToolContent(blocks []Content, isError bool) interface{} {
+	var sb strings.Builder
+	sawImage := false
+	for _, b := range blocks {
+		switch v := b.(type) {
+		case TextBlock:
+			if sb.Len() > 0 {
+				sb.WriteString("\n")
+			}
+			sb.WriteString(v.Text)
+		case ImageBlock:
+			sawImage = true
+		}
+	}
+	if sawImage {
+		if sb.Len() > 0 {
+			sb.WriteString("\n")
+		}
+		sb.WriteString(oaiToolImagePlaceholder)
+	}
+	if isError && sb.Len() > 0 {
+		sb.WriteString(" [error]")
+	}
+	return sb.String()
+}
+
+// buildOAIContentBlocks converts blocks into the array content form that
+// buildOAIUserContent returns after its plain-string path.
+func buildOAIContentBlocks(blocks []Content) []interface{} {
 	var arr []interface{}
 	for _, b := range blocks {
 		switch v := b.(type) {