mirror of
https://github.com/patriceckhart/zot.git
synced 2026-06-27 22:06:31 +02:00
read tool and Anthropic builder derived an image's media type from its file extension, so a .png file containing JPEG bytes was declared image/png. Anthropic sniffs the real bytes and 400s the whole request on a mismatch, breaking the session and making it impossible to resume. - read tool now sniffs the real format from magic bytes (sniffImageMIME) - Anthropic request builder reconciles declared MIME against the actual bytes on every outbound image, repairing already-persisted sessions on continue/resume - kitty renderer re-encodes non-PNG images to PNG (f=100 is PNG-only), fixing empty image boxes for JPEG/GIF screenshots
226 lines
6.3 KiB
Go
226 lines
6.3 KiB
Go
// Package tools implements zot's built-in tools: read, write, edit, bash.
|
|
package tools
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/patriceckhart/zot/packages/core"
|
|
"github.com/patriceckhart/zot/packages/provider"
|
|
)
|
|
|
|
const (
|
|
maxReadLines = 2000
|
|
maxReadBytes = 50 * 1024
|
|
)
|
|
|
|
// ReadTool reads file contents from disk.
|
|
type ReadTool struct {
|
|
CWD string
|
|
Sandbox *Sandbox // when jailed, confines reads to the sandbox root
|
|
}
|
|
|
|
type readArgs struct {
|
|
Path string `json:"path"`
|
|
Offset int `json:"offset,omitempty"`
|
|
Limit int `json:"limit,omitempty"`
|
|
}
|
|
|
|
const readSchema = `{"type":"object","properties":{"path":{"type":"string"},"offset":{"type":"integer"},"limit":{"type":"integer"}},"required":["path"]}`
|
|
|
|
func (t *ReadTool) Name() string { return "read" }
|
|
func (t *ReadTool) Description() string {
|
|
return "Read a file. Images (png/jpg/gif/webp) return inline."
|
|
}
|
|
func (t *ReadTool) Schema() json.RawMessage { return json.RawMessage(readSchema) }
|
|
|
|
func (t *ReadTool) Execute(ctx context.Context, raw json.RawMessage, progress func(string)) (core.ToolResult, error) {
|
|
var a readArgs
|
|
if err := json.Unmarshal(raw, &a); err != nil {
|
|
return core.ToolResult{}, fmt.Errorf("invalid args: %w", err)
|
|
}
|
|
if a.Path == "" {
|
|
return core.ToolResult{}, fmt.Errorf("path is required")
|
|
}
|
|
path := resolvePath(t.CWD, a.Path)
|
|
if err := t.Sandbox.CheckPath(path); err != nil {
|
|
return core.ToolResult{}, err
|
|
}
|
|
// What the model sees in errors: relative to the sandbox root when
|
|
// jailed, so absolute paths stay out of the context window.
|
|
shown := t.Sandbox.DisplayPath(path, a.Path)
|
|
|
|
info, err := os.Stat(path)
|
|
if err != nil {
|
|
return core.ToolResult{}, err
|
|
}
|
|
if info.IsDir() {
|
|
return core.ToolResult{}, fmt.Errorf("%s is a directory", shown)
|
|
}
|
|
|
|
// Image handling.
|
|
if mime := imageMIME(path); mime != "" {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return core.ToolResult{}, err
|
|
}
|
|
// The extension is only a hint. Files are routinely mislabeled
|
|
// (a .png that is actually JPEG bytes, a renamed download, an
|
|
// editor that re-encoded on save). Anthropic and other providers
|
|
// sniff the real bytes and reject the whole request when the
|
|
// declared media type disagrees, which would break the session.
|
|
// Always derive the MIME from the actual content; fall back to
|
|
// the extension-based guess only when the bytes are unrecognized.
|
|
if sniffed := sniffImageMIME(data); sniffed != "" {
|
|
mime = sniffed
|
|
}
|
|
return core.ToolResult{
|
|
Content: []provider.Content{provider.ImageBlock{MimeType: mime, Data: data}},
|
|
}, nil
|
|
}
|
|
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return core.ToolResult{}, err
|
|
}
|
|
defer f.Close()
|
|
|
|
// Read up to maxReadBytes and also line-limit.
|
|
limited := io.LimitReader(f, int64(maxReadBytes)+1)
|
|
data, err := io.ReadAll(limited)
|
|
if err != nil {
|
|
return core.ToolResult{}, err
|
|
}
|
|
truncBytes := len(data) > maxReadBytes
|
|
if truncBytes {
|
|
data = data[:maxReadBytes]
|
|
}
|
|
|
|
if looksBinary(data) {
|
|
return core.ToolResult{}, fmt.Errorf("%s looks binary; refusing to read as text", a.Path)
|
|
}
|
|
|
|
lines := strings.Split(string(data), "\n")
|
|
// Trim trailing empty line from final \n.
|
|
if n := len(lines); n > 0 && lines[n-1] == "" {
|
|
lines = lines[:n-1]
|
|
}
|
|
|
|
start := 0
|
|
if a.Offset > 0 {
|
|
start = a.Offset - 1
|
|
if start > len(lines) {
|
|
start = len(lines)
|
|
}
|
|
}
|
|
end := len(lines)
|
|
if a.Limit > 0 && start+a.Limit < end {
|
|
end = start + a.Limit
|
|
}
|
|
selected := lines[start:end]
|
|
|
|
truncLines := false
|
|
if len(selected) > maxReadLines {
|
|
selected = selected[:maxReadLines]
|
|
truncLines = true
|
|
}
|
|
|
|
// Raw file contents go to the model. We deliberately DON'T
|
|
// prepend line numbers here: they'd inflate the token count by
|
|
// ~15-20% on typical source files (7 bytes per line, every
|
|
// line, every time the file gets re-sent as context on later
|
|
// turns) and the model doesn't need them — edit goes through
|
|
// exact-match text replacement, not line ranges.
|
|
//
|
|
// The TUI renders its own gutter using the start offset stored
|
|
// in Details, so the on-screen view still looks like cat -n.
|
|
var sb strings.Builder
|
|
for _, line := range selected {
|
|
sb.WriteString(line)
|
|
sb.WriteByte('\n')
|
|
}
|
|
if truncLines || truncBytes {
|
|
sb.WriteString("\n")
|
|
}
|
|
if truncLines {
|
|
sb.WriteString(fmt.Sprintf("... [truncated at %d lines]\n", maxReadLines))
|
|
}
|
|
if truncBytes {
|
|
sb.WriteString(fmt.Sprintf("... [truncated at %d bytes]\n", maxReadBytes))
|
|
}
|
|
|
|
return core.ToolResult{
|
|
Content: []provider.Content{provider.TextBlock{Text: sb.String()}},
|
|
Details: map[string]any{
|
|
"path": path,
|
|
"start_line": start + 1, // 1-indexed; TUI draws the gutter
|
|
"lines_truncated": truncLines,
|
|
"bytes_truncated": truncBytes,
|
|
"total_lines": len(lines),
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
func resolvePath(cwd, p string) string {
|
|
if filepath.IsAbs(p) {
|
|
return p
|
|
}
|
|
if cwd == "" {
|
|
cwd, _ = os.Getwd()
|
|
}
|
|
return filepath.Join(cwd, p)
|
|
}
|
|
|
|
func imageMIME(path string) string {
|
|
switch strings.ToLower(filepath.Ext(path)) {
|
|
case ".png":
|
|
return "image/png"
|
|
case ".jpg", ".jpeg":
|
|
return "image/jpeg"
|
|
case ".gif":
|
|
return "image/gif"
|
|
case ".webp":
|
|
return "image/webp"
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// sniffImageMIME inspects the leading bytes of an image file and
|
|
// returns the real media type, independent of the file's extension.
|
|
// Providers validate the declared media type against the actual bytes
|
|
// and 400 the whole request on a mismatch, so the extension can never
|
|
// be trusted. Returns "" when the format is not one zot ships images
|
|
// for, leaving the caller's extension-based guess in place.
|
|
func sniffImageMIME(data []byte) string {
|
|
switch {
|
|
case len(data) >= 8 && bytes.Equal(data[:8], []byte{0x89, 'P', 'N', 'G', 0x0D, 0x0A, 0x1A, 0x0A}):
|
|
return "image/png"
|
|
case len(data) >= 3 && data[0] == 0xFF && data[1] == 0xD8 && data[2] == 0xFF:
|
|
return "image/jpeg"
|
|
case len(data) >= 6 && (bytes.Equal(data[:6], []byte("GIF87a")) || bytes.Equal(data[:6], []byte("GIF89a"))):
|
|
return "image/gif"
|
|
case len(data) >= 12 && bytes.Equal(data[:4], []byte("RIFF")) && bytes.Equal(data[8:12], []byte("WEBP")):
|
|
return "image/webp"
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// looksBinary returns true if the buffer contains a NUL byte in its first 8 KiB.
|
|
func looksBinary(b []byte) bool {
|
|
n := len(b)
|
|
if n > 8192 {
|
|
n = 8192
|
|
}
|
|
for i := 0; i < n; i++ {
|
|
if b[i] == 0 {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|