mirror of
https://github.com/patriceckhart/zot.git
synced 2026-06-26 21:36:31 +02:00
fix(openai): pass image tool results to vision models
Mirror image-bearing tool output into an OpenAI-only user message so GPT vision models receive image bytes, and hide that synthetic message from the TUI transcript.
This commit is contained in:
parent
2cf21d7bda
commit
8d3b7ff155
3 changed files with 107 additions and 15 deletions
|
|
@ -560,7 +560,7 @@ func (i *Interactive) redraw() {
|
|||
|
||||
cols, _ := i.cfg.Terminal.Size()
|
||||
if i.agent != nil {
|
||||
i.view.Messages = i.agent.Messages()
|
||||
i.view.Messages = filterHiddenTranscriptMessages(i.agent.Messages())
|
||||
} else {
|
||||
i.view.Messages = nil
|
||||
}
|
||||
|
|
@ -896,6 +896,33 @@ func alignSliceStartToImageBlock(chat []string, start, end int) int {
|
|||
return start
|
||||
}
|
||||
|
||||
const hiddenOpenAIImageMirrorPrefix = "Tool output included the following image content:"
|
||||
|
||||
func filterHiddenTranscriptMessages(msgs []provider.Message) []provider.Message {
|
||||
if len(msgs) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]provider.Message, 0, len(msgs))
|
||||
for _, m := range msgs {
|
||||
if isHiddenTranscriptMessage(m) {
|
||||
continue
|
||||
}
|
||||
out = append(out, m)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func isHiddenTranscriptMessage(m provider.Message) bool {
|
||||
if m.Role != provider.RoleUser || len(m.Content) == 0 {
|
||||
return false
|
||||
}
|
||||
tb, ok := m.Content[0].(provider.TextBlock)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(tb.Text) == hiddenOpenAIImageMirrorPrefix
|
||||
}
|
||||
|
||||
func clipBottomClippedImages(lines []string) []string {
|
||||
if len(lines) == 0 {
|
||||
return lines
|
||||
|
|
|
|||
|
|
@ -181,6 +181,18 @@ func (a *Agent) runLoop(ctx context.Context, sink func(AgentEvent)) error {
|
|||
toolMsg, hadError := a.executeTools(ctx, assistantMsg, sink)
|
||||
a.mu.Lock()
|
||||
a.messages = append(a.messages, toolMsg)
|
||||
// OpenAI's chat-completions tool message shape is text-centric.
|
||||
// Vision models reliably consume images when they arrive as user
|
||||
// content, so when a tool result contains images we mirror them
|
||||
// into a synthetic user message immediately after the tool result.
|
||||
// This keeps the transcript self-contained for providers that can
|
||||
// see image blocks in tool messages while making OpenAI vision
|
||||
// models actually receive the image bytes.
|
||||
if a.Client != nil && a.Client.Name() == "openai" {
|
||||
if mirror := mirrorToolImagesAsUser(toolMsg); len(mirror.Content) > 0 {
|
||||
a.messages = append(a.messages, mirror)
|
||||
}
|
||||
}
|
||||
a.mu.Unlock()
|
||||
// If context was cancelled during tool execution, bail out.
|
||||
if err := ctx.Err(); err != nil {
|
||||
|
|
@ -388,6 +400,35 @@ func (a *Agent) runOneTool(ctx context.Context, tc provider.ToolCallBlock, sink
|
|||
// extractText concatenates all TextBlock content in a message. Used
|
||||
// by BeforeAssistantMessage so guards see a single string instead of
|
||||
// having to walk provider.Content themselves.
|
||||
func mirrorToolImagesAsUser(msg provider.Message) provider.Message {
|
||||
var content []provider.Content
|
||||
for _, c := range msg.Content {
|
||||
tr, ok := c.(provider.ToolResultBlock)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
for _, inner := range tr.Content {
|
||||
switch v := inner.(type) {
|
||||
case provider.TextBlock:
|
||||
// Keep short textual context so the model understands why
|
||||
// the images appeared, but don't duplicate giant read
|
||||
// outputs verbatim.
|
||||
if len(v.Text) > 0 && len(v.Text) <= 500 {
|
||||
content = append(content, v)
|
||||
}
|
||||
case provider.ImageBlock:
|
||||
content = append(content, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(content) == 0 {
|
||||
return provider.Message{}
|
||||
}
|
||||
prefix := provider.TextBlock{Text: "Tool output included the following image content:"}
|
||||
content = append([]provider.Content{prefix}, content...)
|
||||
return provider.Message{Role: provider.RoleUser, Content: content, Time: time.Now()}
|
||||
}
|
||||
|
||||
func extractText(msg provider.Message) string {
|
||||
var out string
|
||||
for _, c := range msg.Content {
|
||||
|
|
|
|||
|
|
@ -174,25 +174,16 @@ func (c *openaiClient) buildRequest(req Request) (*oaiRequest, error) {
|
|||
}
|
||||
out.Messages = append(out.Messages, am)
|
||||
case RoleTool:
|
||||
// Each ToolResultBlock becomes its own tool message.
|
||||
// Each ToolResultBlock becomes its own tool message. Preserve
|
||||
// image blocks for vision-capable OpenAI models instead of
|
||||
// flattening the tool output to plain text.
|
||||
for _, b := range msg.Content {
|
||||
if tr, ok := b.(ToolResultBlock); ok {
|
||||
var text strings.Builder
|
||||
for _, inner := range tr.Content {
|
||||
if tb, ok := inner.(TextBlock); ok {
|
||||
if text.Len() > 0 {
|
||||
text.WriteString("\n")
|
||||
}
|
||||
text.WriteString(tb.Text)
|
||||
}
|
||||
}
|
||||
if tr.IsError && text.Len() > 0 {
|
||||
text.WriteString(" [error]")
|
||||
}
|
||||
content := buildOAIToolContent(tr.Content, tr.IsError)
|
||||
out.Messages = append(out.Messages, oaiMessage{
|
||||
Role: "tool",
|
||||
ToolCallID: tr.CallID,
|
||||
Content: text.String(),
|
||||
Content: content,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -234,6 +225,36 @@ func buildOAIUserContent(blocks []Content) interface{} {
|
|||
}
|
||||
return sb.String()
|
||||
}
|
||||
return buildOAIContentBlocks(blocks, false)
|
||||
}
|
||||
|
||||
func buildOAIToolContent(blocks []Content, isError bool) interface{} {
|
||||
hasImage := false
|
||||
for _, b := range blocks {
|
||||
if _, ok := b.(ImageBlock); ok {
|
||||
hasImage = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasImage {
|
||||
var sb strings.Builder
|
||||
for _, b := range blocks {
|
||||
if tb, ok := b.(TextBlock); ok {
|
||||
if sb.Len() > 0 {
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
sb.WriteString(tb.Text)
|
||||
}
|
||||
}
|
||||
if isError && sb.Len() > 0 {
|
||||
sb.WriteString(" [error]")
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
return buildOAIContentBlocks(blocks, isError)
|
||||
}
|
||||
|
||||
func buildOAIContentBlocks(blocks []Content, isError bool) []interface{} {
|
||||
var arr []interface{}
|
||||
for _, b := range blocks {
|
||||
switch v := b.(type) {
|
||||
|
|
@ -246,6 +267,9 @@ func buildOAIUserContent(blocks []Content) interface{} {
|
|||
arr = append(arr, img)
|
||||
}
|
||||
}
|
||||
if isError {
|
||||
arr = append(arr, oaiContentText{Type: "text", Text: "[error]"})
|
||||
}
|
||||
return arr
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue