fix(openai): pass image tool results to vision models

Mirror image-bearing tool output into an OpenAI-only user message so GPT vision models receive image bytes, and hide that synthetic message from the TUI transcript.
This commit is contained in:
patriceckhart 2026-04-21 21:45:26 +02:00
parent 2cf21d7bda
commit 8d3b7ff155
3 changed files with 107 additions and 15 deletions

View file

@ -560,7 +560,7 @@ func (i *Interactive) redraw() {
cols, _ := i.cfg.Terminal.Size()
if i.agent != nil {
i.view.Messages = i.agent.Messages()
i.view.Messages = filterHiddenTranscriptMessages(i.agent.Messages())
} else {
i.view.Messages = nil
}
@ -896,6 +896,33 @@ func alignSliceStartToImageBlock(chat []string, start, end int) int {
return start
}
// hiddenOpenAIImageMirrorPrefix is the marker text that identifies a
// synthetic user message which should not be rendered in the transcript.
// isHiddenTranscriptMessage compares it (after trimming whitespace) with the
// first text block of a user message, so it must stay identical to the
// prefix text that mirrorToolImagesAsUser puts at the front of the messages
// it builds.
const hiddenOpenAIImageMirrorPrefix = "Tool output included the following image content:"
// filterHiddenTranscriptMessages returns a copy of msgs without the entries
// that isHiddenTranscriptMessage reports as hidden. The input slice is never
// modified. An empty or nil input yields nil; otherwise the result is a
// freshly allocated slice (possibly of length zero when every message was
// hidden).
func filterHiddenTranscriptMessages(msgs []provider.Message) []provider.Message {
	if len(msgs) == 0 {
		return nil
	}

	visible := make([]provider.Message, 0, len(msgs))
	for idx := range msgs {
		if !isHiddenTranscriptMessage(msgs[idx]) {
			visible = append(visible, msgs[idx])
		}
	}
	return visible
}
// isHiddenTranscriptMessage reports whether m should be kept out of the
// rendered transcript. A message qualifies only when it has the user role,
// has at least one content block, and its first block is a text block whose
// whitespace-trimmed text equals hiddenOpenAIImageMirrorPrefix.
func isHiddenTranscriptMessage(m provider.Message) bool {
	if len(m.Content) == 0 || m.Role != provider.RoleUser {
		return false
	}
	if first, isText := m.Content[0].(provider.TextBlock); isText {
		return strings.TrimSpace(first.Text) == hiddenOpenAIImageMirrorPrefix
	}
	return false
}
func clipBottomClippedImages(lines []string) []string {
if len(lines) == 0 {
return lines

View file

@ -181,6 +181,18 @@ func (a *Agent) runLoop(ctx context.Context, sink func(AgentEvent)) error {
toolMsg, hadError := a.executeTools(ctx, assistantMsg, sink)
a.mu.Lock()
a.messages = append(a.messages, toolMsg)
// OpenAI's chat-completions tool message shape is text-centric.
// Vision models reliably consume images when they arrive as user
// content, so when a tool result contains images we mirror them
// into a synthetic user message immediately after the tool result.
// This keeps the transcript self-contained for providers that can
// see image blocks in tool messages while making OpenAI vision
// models actually receive the image bytes.
if a.Client != nil && a.Client.Name() == "openai" {
if mirror := mirrorToolImagesAsUser(toolMsg); len(mirror.Content) > 0 {
a.messages = append(a.messages, mirror)
}
}
a.mu.Unlock()
// If context was cancelled during tool execution, bail out.
if err := ctx.Err(); err != nil {
@ -388,6 +400,35 @@ func (a *Agent) runOneTool(ctx context.Context, tc provider.ToolCallBlock, sink
// mirrorToolImagesAsUser builds a synthetic user-role message that repeats
// the image blocks found inside the tool results of msg.
//
// Every provider.ToolResultBlock in msg.Content is scanned in order. Image
// blocks are always copied; text blocks are copied only when they are
// non-empty and at most 500 bytes long, so the model gets a little context
// without large tool outputs being duplicated. Other block kinds are ignored.
//
// If no image block is found the zero provider.Message is returned (its
// Content is empty), even when short text was present: a mirror that
// announces image content but carries none would only add noise to the
// conversation. Otherwise the returned message has Role provider.RoleUser,
// Time set to time.Now(), and a leading text block with a fixed
// announcement followed by the collected blocks.
func mirrorToolImagesAsUser(msg provider.Message) provider.Message {
	// Upper bound, in bytes, on a text block that is carried along as context.
	const maxContextTextLen = 500

	var (
		collected []provider.Content
		hasImage  bool
	)
	for _, c := range msg.Content {
		tr, ok := c.(provider.ToolResultBlock)
		if !ok {
			continue
		}
		for _, inner := range tr.Content {
			switch v := inner.(type) {
			case provider.TextBlock:
				// Keep short textual context so the model understands why
				// the images appeared, but don't duplicate giant read
				// outputs verbatim.
				if len(v.Text) > 0 && len(v.Text) <= maxContextTextLen {
					collected = append(collected, v)
				}
			case provider.ImageBlock:
				hasImage = true
				collected = append(collected, v)
			}
		}
	}

	// Text alone is not worth mirroring: the tool message already carries it.
	if !hasImage {
		return provider.Message{}
	}

	// The announcement text is also what the transcript filter matches on to
	// hide this message, so it must not change independently of that filter.
	content := make([]provider.Content, 0, len(collected)+1)
	content = append(content, provider.TextBlock{Text: "Tool output included the following image content:"})
	content = append(content, collected...)
	return provider.Message{Role: provider.RoleUser, Content: content, Time: time.Now()}
}

// extractText concatenates all TextBlock content in a message. Used
// by BeforeAssistantMessage so guards see a single string instead of
// having to walk provider.Content themselves.
func extractText(msg provider.Message) string {
var out string
for _, c := range msg.Content {

View file

@ -174,25 +174,16 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
}
out.Messages = append(out.Messages, am)
case RoleTool:
// Each ToolResultBlock becomes its own tool message.
// Each ToolResultBlock becomes its own tool message. Preserve
// image blocks for vision-capable OpenAI models instead of
// flattening the tool output to plain text.
for _, b := range msg.Content {
if tr, ok := b.(ToolResultBlock); ok {
var text strings.Builder
for _, inner := range tr.Content {
if tb, ok := inner.(TextBlock); ok {
if text.Len() > 0 {
text.WriteString("\n")
}
text.WriteString(tb.Text)
}
}
if tr.IsError && text.Len() > 0 {
text.WriteString(" [error]")
}
content := buildOAIToolContent(tr.Content, tr.IsError)
out.Messages = append(out.Messages, oaiMessage{
Role: "tool",
ToolCallID: tr.CallID,
Content: text.String(),
Content: content,
})
}
}
@ -234,6 +225,36 @@ func buildOAIUserContent(blocks []Content) interface{} {
}
return sb.String()
}
return buildOAIContentBlocks(blocks, false)
}
// buildOAIToolContent converts the content of one tool result into the value
// placed in the "content" field of an OpenAI tool message.
//
// When blocks contains at least one ImageBlock the conversion is delegated to
// buildOAIContentBlocks (passing isError through) and the result is a slice
// of content parts. Otherwise the result is a plain string: the text of every
// TextBlock joined with "\n" (a separator is written only once some text has
// already been accumulated), with " [error]" appended when isError is set and
// the accumulated text is non-empty.
func buildOAIToolContent(blocks []Content, isError bool) interface{} {
	// Any image switches the whole result to the multi-part representation.
	for _, blk := range blocks {
		if _, isImage := blk.(ImageBlock); isImage {
			return buildOAIContentBlocks(blocks, isError)
		}
	}

	var text strings.Builder
	for _, blk := range blocks {
		tb, isText := blk.(TextBlock)
		if !isText {
			continue
		}
		if text.Len() != 0 {
			text.WriteString("\n")
		}
		text.WriteString(tb.Text)
	}
	if isError && text.Len() != 0 {
		text.WriteString(" [error]")
	}
	return text.String()
}
func buildOAIContentBlocks(blocks []Content, isError bool) []interface{} {
var arr []interface{}
for _, b := range blocks {
switch v := b.(type) {
@ -246,6 +267,9 @@ func buildOAIUserContent(blocks []Content) interface{} {
arr = append(arr, img)
}
}
if isError {
arr = append(arr, oaiContentText{Type: "text", Text: "[error]"})
}
return arr
}