From 8a2d07edf8b35ffbdd6e1e6007d89887cf861f7b Mon Sep 17 00:00:00 2001
From: Sam & Claude
Date: Fri, 5 Jun 2026 13:15:32 +0200
Subject: [PATCH] chore: remove Tier-A unused SaaS modules (browser-operator, tts, vision) (Sam & Claude)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Verified-safe slice of the prune plan: only modules with ZERO importers
(static or dynamic) — confirmed by import-graph scan, not name-based
categorization.

Removed (src 62,085 -> 61,280; 805 lines, 6 files):
- src/browser-operator.ts (+test) — no references anywhere
- src/tts.ts (+test) — 0 importers; ttsMode config elsewhere is unrelated
  and untouched
- src/vision.ts (+test) — 0 importers (prior "vision" hits were substrings
  of "provisioning")

Aligns with config.ts core profile ("no budget/TTS/STT/vision").
Post-delete import-graph scan is clean.

NOT included from the original "delete now" list — they have real importers
and were mis-bucketed: memory-architecture/lifecycle + database-architecture
(the DB pool — core infra), outbound-images (live Telegram path),
transcription/stt-guard (Telegram voice), tenant-*/platform-*/surface-*/stripe-*
(woven in). Those need untangle-then-delete, not a blind removal.

GATE (run on a host with deps): npm run typecheck && npm run test (vitest).
No local node_modules here, so verification was the import graph.

Co-Authored-By: Claude Opus 4.8 --- src/browser-operator.test.ts | 115 ---------------------- src/browser-operator.ts | 161 ------------------------------ src/tts.test.ts | 115 ---------------------- src/tts.ts | 171 -------------------------------- src/vision.test.ts | 60 ------------ src/vision.ts | 183 ----------------------------------- 6 files changed, 805 deletions(-) delete mode 100644 src/browser-operator.test.ts delete mode 100644 src/browser-operator.ts delete mode 100644 src/tts.test.ts delete mode 100644 src/tts.ts delete mode 100644 src/vision.test.ts delete mode 100644 src/vision.ts diff --git a/src/browser-operator.test.ts b/src/browser-operator.test.ts deleted file mode 100644 index 91d6bbe..0000000 --- a/src/browser-operator.test.ts +++ /dev/null @@ -1,115 +0,0 @@ -import { describe, expect, it, vi } from 'vitest'; - -import { createClawdieBrowserOperator, type ClawdieBrowserOperator } from './browser-operator.js'; -import type { BackendCaller } from './browser-orchestrator.js'; -import type { BrowserCloneRow } from './browser-session-registry.js'; - -const baseTime = new Date('2026-05-11T12:00:00.000Z'); - -class OperatorPool { - row: BrowserCloneRow = { - session_id: '00000000-0000-4000-8000-000000000150', - tenant_id: 'tenant-a', - clone_name: 'browsertask150', - ip: '192.168.72.150', - status: 'open', - credential_mode: 'clean', - operator_grant_token_jti: null, - created_at: baseTime, - updated_at: baseTime, - opened_at: baseTime, - closed_at: null, - last_action_at: null, - expires_at: null, - error_code: null, - error_message: null, - }; - - async query(sql: string, params: unknown[] = []): Promise<{ rows: any[] }> { - const normalized = sql.replace(/\s+/g, ' ').trim(); - if (normalized.startsWith('SELECT session_id::text')) { - return { rows: this.row.session_id === params[0] ? 
[this.row] : [] }; - } - if (normalized.startsWith('UPDATE browser_clones')) { - const [, status, updatedAt, , closedAt, lastActionAt] = params; - this.row.status = status as any; - this.row.updated_at = updatedAt as Date; - if (closedAt) this.row.closed_at = closedAt as Date; - if (lastActionAt) this.row.last_action_at = lastActionAt as Date; - } - return { rows: [] }; - } -} - -describe('ClawdieBrowserOperator', () => { - it('exports the UI-TARS-compatible screenshot/execute shape', () => { - const operator: ClawdieBrowserOperator = createClawdieBrowserOperator(new OperatorPool() as any, 's1'); - expect(typeof operator.screenshot).toBe('function'); - expect(typeof operator.execute).toBe('function'); - expect(typeof operator.finished).toBe('function'); - expect(typeof operator.close).toBe('function'); - }); - - it('adapts screenshot responses to imageBase64', async () => { - const backend: BackendCaller = vi.fn(async () => ({ - ok: true, - status: 200, - data: { ok: true, image_base64: 'iVBORw0=', width: 1024, height: 768 }, - })); - const operator = createClawdieBrowserOperator( - new OperatorPool() as any, - '00000000-0000-4000-8000-000000000150', - { backend, now: () => baseTime }, - ); - await expect(operator.screenshot()).resolves.toEqual({ - imageBase64: 'iVBORw0=', - width: 1024, - height: 768, - }); - expect(backend).toHaveBeenCalledWith('192.168.72.150', 'POST', '/screenshot', {}, 30000); - }); - - it('translates basic predictions to browser action helpers', async () => { - const backend: BackendCaller = vi.fn(async (_ip, _method, path, body) => ({ - ok: true, - status: 200, - data: { ok: true, path, body }, - })); - const operator = createClawdieBrowserOperator( - new OperatorPool() as any, - '00000000-0000-4000-8000-000000000150', - { backend, now: () => baseTime }, - ); - await expect(operator.execute({ action: 'navigate', url: 'https://example.com/' })).resolves.toMatchObject({ - path: '/navigate', - body: { url: 'https://example.com/' }, - }); - 
await expect(operator.execute({ action: 'click', selector: '#go' })).resolves.toMatchObject({ - path: '/click', - body: { selector: '#go' }, - }); - await expect(operator.execute({ action: 'type', text: 'hello' })).resolves.toMatchObject({ - path: '/type', - body: { text: 'hello' }, - }); - }); - - it('tracks finish and closes the browser session', async () => { - const pool = new OperatorPool(); - const hostd = vi.fn(async () => ({ id: 'x', ok: true, output: 'ok' })); - const operator = createClawdieBrowserOperator( - pool as any, - '00000000-0000-4000-8000-000000000150', - { hostd, now: () => baseTime }, - ); - await expect(operator.execute({ action: 'finish' })).resolves.toEqual({ ok: true, finished: true }); - expect(await operator.finished?.()).toBe(true); - await operator.close?.(); - expect(hostd).toHaveBeenCalledWith('browser-clone-reap', { - clone: 'browsertask150', - ip: '192.168.72.150', - suffix: 'bt150', - }); - expect(pool.row.status).toBe('closed'); - }); -}); diff --git a/src/browser-operator.ts b/src/browser-operator.ts deleted file mode 100644 index 1fb3469..0000000 --- a/src/browser-operator.ts +++ /dev/null @@ -1,161 +0,0 @@ -import type pg from 'pg'; - -import { - clickBrowserSession, - closeBrowserSession, - navigateBrowserSession, - readDomBrowserSession, - screenshotBrowserSession, - scrollBrowserSession, - typeBrowserSession, - type BrowserOrchestratorDeps, -} from './browser-orchestrator.js'; - -export interface ClawdieBrowserScreenshot { - imageBase64: string; - width?: number; - height?: number; -} - -export interface ClawdieBrowserOperator { - screenshot(): Promise; - execute(prediction: unknown): Promise; - finished?(): Promise | boolean; - close?(): Promise; -} - -type Prediction = { - action?: string; - type?: string; - url?: string; - x?: number; - y?: number; - selector?: string; - text?: string; - dx?: number; - dy?: number; - full_page?: boolean; - max_bytes?: number; -}; - -function predictionAction(prediction: Prediction): 
string { - return String(prediction.action ?? prediction.type ?? '').trim().toLowerCase(); -} - -function ensureObject(prediction: unknown): Prediction { - if (!prediction || typeof prediction !== 'object') { - throw new Error('browser operator prediction must be an object'); - } - return prediction as Prediction; -} - -function unwrap(result: { ok: true; value: T } | { ok: false; code: string; error: string }): T { - if (!result.ok) throw Object.assign(new Error(result.error), { code: result.code }); - return result.value; -} - -export class BrowserSessionOperator implements ClawdieBrowserOperator { - private done = false; - - constructor( - private readonly pool: pg.Pool, - private readonly sessionId: string, - private readonly deps: BrowserOrchestratorDeps = {}, - ) {} - - async screenshot(): Promise { - const result = unwrap(await screenshotBrowserSession(this.pool, this.sessionId, {}, this.deps)) as { - image_base64?: string; - width?: number; - height?: number; - }; - return { - imageBase64: result.image_base64 ?? '', - width: result.width, - height: result.height, - }; - } - - async execute(rawPrediction: unknown): Promise { - const prediction = ensureObject(rawPrediction); - const action = predictionAction(prediction); - switch (action) { - case 'navigate': - case 'open': - if (!prediction.url) throw new Error('navigate action requires url'); - return unwrap(await navigateBrowserSession(this.pool, this.sessionId, { url: prediction.url }, this.deps)); - case 'click': - return unwrap( - await clickBrowserSession( - this.pool, - this.sessionId, - prediction.selector - ? { selector: prediction.selector } - : { x: prediction.x, y: prediction.y }, - this.deps, - ), - ); - case 'type': - case 'input': - return unwrap( - await typeBrowserSession( - this.pool, - this.sessionId, - { text: prediction.text ?? 
'', selector: prediction.selector }, - this.deps, - ), - ); - case 'scroll': - return unwrap( - await scrollBrowserSession( - this.pool, - this.sessionId, - { dx: prediction.dx, dy: prediction.dy, selector: prediction.selector }, - this.deps, - ), - ); - case 'read_dom': - case 'read': - return unwrap( - await readDomBrowserSession( - this.pool, - this.sessionId, - { max_bytes: prediction.max_bytes }, - this.deps, - ), - ); - case 'screenshot': - return unwrap( - await screenshotBrowserSession( - this.pool, - this.sessionId, - { full_page: prediction.full_page }, - this.deps, - ), - ); - case 'finish': - case 'done': - this.done = true; - return { ok: true, finished: true }; - default: - throw new Error(`unsupported browser operator action: ${action || 'unknown'}`); - } - } - - finished(): boolean { - return this.done; - } - - async close(): Promise { - await closeBrowserSession(this.pool, this.sessionId, this.deps); - this.done = true; - } -} - -export function createClawdieBrowserOperator( - pool: pg.Pool, - sessionId: string, - deps: BrowserOrchestratorDeps = {}, -): ClawdieBrowserOperator { - return new BrowserSessionOperator(pool, sessionId, deps); -} diff --git a/src/tts.test.ts b/src/tts.test.ts deleted file mode 100644 index 314b62e..0000000 --- a/src/tts.test.ts +++ /dev/null @@ -1,115 +0,0 @@ -import { describe, it, expect } from 'vitest'; - -import { shouldApplyTts, stripTtsMarker, stripMarkdown } from './tts.js'; - -describe('shouldApplyTts', () => { - it('returns false for off mode', () => { - expect( - shouldApplyTts({ mode: 'off', hadInboundAudio: false, text: 'hello' }), - ).toBe(false); - }); - - it('returns true for always mode', () => { - expect( - shouldApplyTts({ mode: 'always', hadInboundAudio: false, text: 'hello' }), - ).toBe(true); - }); - - it('returns true for inbound mode when inbound was audio', () => { - expect( - shouldApplyTts({ mode: 'inbound', hadInboundAudio: true, text: 'hello' }), - ).toBe(true); - }); - - it('returns false for 
inbound mode when inbound was not audio', () => { - expect( - shouldApplyTts({ - mode: 'inbound', - hadInboundAudio: false, - text: 'hello', - }), - ).toBe(false); - }); - - it('returns true for tagged mode when [[tts]] is present', () => { - expect( - shouldApplyTts({ - mode: 'tagged', - hadInboundAudio: false, - text: 'hello [[tts]] world', - }), - ).toBe(true); - }); - - it('returns false for tagged mode without marker', () => { - expect( - shouldApplyTts({ - mode: 'tagged', - hadInboundAudio: false, - text: 'hello world', - }), - ).toBe(false); - }); - - it('[[tts]] is case-insensitive', () => { - expect( - shouldApplyTts({ - mode: 'tagged', - hadInboundAudio: false, - text: '[[TTS]]', - }), - ).toBe(true); - }); -}); - -describe('stripTtsMarker', () => { - it('removes [[tts]] from text', () => { - expect(stripTtsMarker('hello [[tts]] world')).toBe('hello world'); - }); - - it('returns text unchanged when no marker', () => { - expect(stripTtsMarker('hello world')).toBe('hello world'); - }); - - it('removes multiple markers', () => { - expect(stripTtsMarker('[[tts]] a [[tts]] b')).toBe('a b'); - }); - - it('is case-insensitive', () => { - expect(stripTtsMarker('[[TTS]]hello')).toBe('hello'); - }); -}); - -describe('stripMarkdown', () => { - it('removes bold markers', () => { - expect(stripMarkdown('**hello**')).toBe('hello'); - }); - - it('removes italic markers', () => { - expect(stripMarkdown('*hello*')).toBe('hello'); - }); - - it('removes code fences', () => { - expect(stripMarkdown('```js\nconsole.log("hi")\n```')).toBe(''); - }); - - it('removes inline code', () => { - expect(stripMarkdown('use `foo` here')).toBe('use foo here'); - }); - - it('removes links keeping text', () => { - expect(stripMarkdown('[click](https://example.com)')).toBe('click'); - }); - - it('removes heading markers', () => { - expect(stripMarkdown('## Title')).toBe('Title'); - }); - - it('removes list markers', () => { - expect(stripMarkdown('- item')).toBe('item'); - }); - - 
it('removes strikethrough', () => { - expect(stripMarkdown('~~deleted~~')).toBe('deleted'); - }); -}); diff --git a/src/tts.ts b/src/tts.ts deleted file mode 100644 index 3ea36f6..0000000 --- a/src/tts.ts +++ /dev/null @@ -1,171 +0,0 @@ -import fs from 'fs'; -import path from 'path'; -import { spawn } from 'child_process'; - -import { logger } from './logger.js'; -import { - TTS_PROVIDER, - TTS_MAX_TEXT_LENGTH, - TTS_OUTPUT_FORMAT, - TTS_VOICE, - TMP_DIR, -} from './config.js'; - -export type TtsAutoMode = 'always' | 'inbound' | 'tagged' | 'off'; - -interface TtsResult { - audioPath: string; -} - -export function stripMarkdown(text: string): string { - let out = text; - out = out.replace(/\[Vision OCR\][\s\S]*?\[\/Vision OCR\]/g, ''); - out = out.replace(/\[System note\][\s\S]*?\[\/System note\]/g, ''); - out = out.replace(/```[\s\S]*?```/g, ''); - out = out.replace(/`([^`]+)`/g, '$1'); - out = out.replace(/\*\*([^*]+)\*\*/g, '$1'); - out = out.replace(/\*([^*]+)\*/g, '$1'); - out = out.replace(/__([^_]+)__/g, '$1'); - out = out.replace(/_([^_]+)_/g, '$1'); - out = out.replace(/~~([^~]+)~~/g, '$1'); - out = out.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'); - out = out.replace(/^#{1,6}\s+/gm, ''); - out = out.replace(/^[-*+]\s+/gm, ''); - out = out.replace(/^\d+\.\s+/gm, ''); - out = out.replace(/^>\s+/gm, ''); - out = out.replace(/---+/g, ''); - out = out.replace(/\n{3,}/g, '\n\n'); - return out.trim(); -} - -export function shouldApplyTts(opts: { - mode: TtsAutoMode; - hadInboundAudio: boolean; - text: string; -}): boolean { - if (opts.mode === 'off') return false; - if (opts.mode === 'always') return true; - if (opts.mode === 'inbound') return opts.hadInboundAudio; - if (opts.mode === 'tagged') return /\[\[tts\]\]/i.test(opts.text); - return false; -} - -export function stripTtsMarker(text: string): string { - return text.replace(/\[\[tts\]\]/gi, '').trim(); -} - -export async function synthesize( - text: string, - opts?: { - voice?: string; - outputFormat?: string; - 
}, -): Promise { - const voice = opts?.voice || TTS_VOICE; - const outputFormat = opts?.outputFormat || TTS_OUTPUT_FORMAT; - - const cleanText = stripMarkdown(text); - if (!cleanText) { - throw new Error('TTS: no text to synthesize after stripping markdown'); - } - - const truncated = - cleanText.length > TTS_MAX_TEXT_LENGTH - ? cleanText.slice(0, TTS_MAX_TEXT_LENGTH) - : cleanText; - - const ttsDir = path.join(TMP_DIR, 'tts'); - fs.mkdirSync(ttsDir, { recursive: true }); - - const ext = inferExtension(outputFormat); - const fileName = `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}${ext}`; - const audioPath = path.join(ttsDir, fileName); - - try { - if (TTS_PROVIDER === 'azure') { - throw new Error('TTS: azure provider not implemented yet'); - } else { - await runEdgeTtsCli({ - text: truncated, - voice, - audioPath, - }); - } - } catch (err) { - try { - fs.unlinkSync(audioPath); - } catch {} - throw err; - } - - if (!fs.existsSync(audioPath)) { - throw new Error('TTS: edge-tts completed but no audio file produced'); - } - const stat = fs.statSync(audioPath); - if (!stat.isFile()) { - throw new Error('TTS: synthesized audio path is not a file'); - } - if (stat.size < 1024) { - throw new Error(`TTS: synthesized audio file too small (${stat.size} bytes)`); - } - - logger.info( - { - voice, - format: outputFormat, - chars: truncated.length, - bytes: stat.size, - path: audioPath, - }, - 'TTS synthesized', - ); - - return { audioPath }; -} - -function runEdgeTtsCli(opts: { - text: string; - voice: string; - audioPath: string; -}): Promise { - return new Promise((resolve, reject) => { - const args = [ - '--text', - opts.text, - '--voice', - opts.voice, - '--write-media', - opts.audioPath, - ]; - const binPath = path.join( - path.dirname(path.dirname(new URL(import.meta.url).pathname)), - 'bin', - 'edge-tts', - ); - const proc = spawn(binPath, args, { stdio: 'ignore' }); - proc.on('error', (err: any) => { - if (err && err.code === 'ENOENT') { - reject( - new 
Error( - 'TTS: edge-tts not found. Install with: pip install edge-tts', - ), - ); - return; - } - reject(err); - }); - proc.on('exit', (code) => { - if (code === 0) resolve(); - else reject(new Error(`TTS: edge-tts exited with code ${code}`)); - }); - }); -} - -function inferExtension(outputFormat: string): string { - const n = outputFormat.toLowerCase(); - if (n.includes('ogg') || n.includes('opus')) return '.ogg'; - if (n.includes('webm')) return '.webm'; - if (n.includes('wav') || n.includes('riff') || n.includes('pcm')) - return '.wav'; - return '.mp3'; -} diff --git a/src/vision.test.ts b/src/vision.test.ts deleted file mode 100644 index a045677..0000000 --- a/src/vision.test.ts +++ /dev/null @@ -1,60 +0,0 @@ -import fs from 'fs'; -import path from 'path'; - -import { afterEach, describe, expect, it, vi } from 'vitest'; - -import { TMP_TESTS_DIR } from './config.js'; - -function writeTinyJpeg(filePath: string): void { - // Minimal JPEG header + EOI (enough for base64 packaging; model may ignore). 
- fs.writeFileSync(filePath, Buffer.from([0xff, 0xd8, 0xff, 0xd9])); -} - -describe('augmentPromptWithVision', () => { - const originalEnv = { ...process.env }; - - afterEach(() => { - process.env = { ...originalEnv }; - vi.restoreAllMocks(); - vi.resetModules(); - }); - - it('injects [Vision OCR] block for saved Telegram photos under TMP_DIR', async () => { - const dir = fs.mkdtempSync(path.join(TMP_TESTS_DIR, 'vision-test-')); - const imagePath = path.join(dir, 'photo.jpg'); - writeTinyJpeg(imagePath); - - process.env.VISION_PROVIDER = 'openrouter'; - process.env.VISION_MODEL = 'nvidia/nemotron-nano-12b-v2-vl:free'; - process.env.OPENROUTER_API_KEY = 'test-key'; - - const fetchMock = vi.fn(async () => { - return { - ok: true, - json: async () => ({ - choices: [ - { - message: { content: 'OCR:\nHELLO\n\nSUMMARY:\n- hi' }, - }, - ], - }), - } as any; - }); - // @ts-expect-error vitest runtime override - globalThis.fetch = fetchMock; - - // config.ts evaluates VISION_PROVIDER at module load; the static - // import of TMP_TESTS_DIR above already cached an empty value. - // Reset so the dynamic import below re-reads env we just set. 
- vi.resetModules(); - const { augmentPromptWithVision } = await import('./vision.js'); - const input = `User: test\n[Photo saved: ${imagePath}]`; - const out = await augmentPromptWithVision(input); - - expect(fetchMock).toHaveBeenCalled(); - expect(out).toContain('[Vision OCR]'); - expect(out).toContain('OCR:\nHELLO'); - expect(out).toContain('Do not claim you "cannot see"'); - }); -}); - diff --git a/src/vision.ts b/src/vision.ts deleted file mode 100644 index 6a7075d..0000000 --- a/src/vision.ts +++ /dev/null @@ -1,183 +0,0 @@ -import fs from 'fs'; -import path from 'path'; - -import { readEnvFile } from './env.js'; -import { - RUNTIME_ID, - TMP_DIR, - VISION_MAX_CHARS_PER_IMAGE, - VISION_MAX_IMAGES, - VISION_MAX_TOTAL_CHARS, - VISION_MODEL, - VISION_PROVIDER, -} from './config.js'; -import { logger } from './logger.js'; - -type OpenRouterResponse = { - choices?: Array<{ - message?: { content?: string }; - }>; -}; - -let cachedOpenRouterKey: string | null | undefined; -function getOpenRouterKey(): string | null { - if (cachedOpenRouterKey !== undefined) return cachedOpenRouterKey; - const fromEnv = process.env.OPENROUTER_API_KEY; - if (fromEnv) { - cachedOpenRouterKey = fromEnv; - return cachedOpenRouterKey; - } - const parsed = readEnvFile(['OPENROUTER_API_KEY']); - cachedOpenRouterKey = parsed.OPENROUTER_API_KEY || null; - return cachedOpenRouterKey; -} - -function clampText(text: string, maxChars: number): string { - if (text.length <= maxChars) return text; - return `${text.slice(0, maxChars)}\n…(truncated)…`; -} - -function isSafeTmpPath(filePath: string): boolean { - const resolved = path.resolve(filePath); - const safeRoot = path.resolve(TMP_DIR) + path.sep; - return resolved.startsWith(safeRoot); -} - -async function describeImageOpenRouter(imagePath: string): Promise { - const key = getOpenRouterKey(); - if (!key) throw new Error('OPENROUTER_API_KEY not configured'); - - const resolved = path.resolve(imagePath); - const bytes = 
fs.readFileSync(resolved); - const base64 = bytes.toString('base64'); - const ext = path.extname(resolved).toLowerCase().replace('.', '') || 'png'; - const mime = ext === 'jpg' ? 'jpeg' : ext; - const dataUrl = `data:image/${mime};base64,${base64}`; - - const prompt = - 'You are an OCR + screenshot/meme helper.\n' + - '\n' + - 'Task:\n' + - '1) Extract ALL readable text verbatim (including meme overlay text).\n' + - '2) If some text is unclear, write [unclear] for that span.\n' + - '3) After OCR, add a short bulleted summary (max 5 bullets) of what the image shows.\n' + - '\n' + - 'Output format:\n' + - 'OCR:\n' + - '\n' + - '\n' + - 'SUMMARY:\n' + - '- ...'; - - const body = { - model: VISION_MODEL, - messages: [ - { - role: 'user', - content: [ - { type: 'text', text: prompt }, - { type: 'image_url', image_url: { url: dataUrl } }, - ], - }, - ], - max_tokens: 1200, - temperature: 0, - }; - - const resp = await fetch('https://openrouter.ai/api/v1/chat/completions', { - method: 'POST', - headers: { - Authorization: `Bearer ${key}`, - 'Content-Type': 'application/json', - 'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI', - 'X-Title': `${RUNTIME_ID}-ai`, - }, - body: JSON.stringify(body), - }); - - if (!resp.ok) { - const txt = await resp.text().catch(() => ''); - throw new Error(`OpenRouter vision failed: ${resp.status} ${txt}`); - } - - const data = (await resp.json()) as OpenRouterResponse; - const content = data.choices?.[0]?.message?.content?.trim() || ''; - if (!content) throw new Error('OpenRouter vision returned empty content'); - return content; -} - -async function describeImage(imagePath: string): Promise { - if (VISION_PROVIDER !== 'openrouter') { - throw new Error(`Unsupported vision provider: ${VISION_PROVIDER || '(empty)'}`); - } - return describeImageOpenRouter(imagePath); -} - -/** - * Replace `[Photo saved: /path]` placeholders with an OCR/description block. - * Only runs when `VISION_PROVIDER` is set and `VISION_MAX_IMAGES > 0`. 
- */ -export async function augmentPromptWithVision(text: string): Promise { - if (!VISION_PROVIDER) return text; - if (VISION_MAX_IMAGES <= 0) return text; - - const re = /\[Photo saved:\s*([^\]\n]+)\]/g; - const matches = Array.from(text.matchAll(re)) - .map((m) => m[1]?.trim()) - .filter(Boolean) as string[]; - if (matches.length === 0) return text; - - const unique: string[] = []; - for (const m of matches) { - if (!unique.includes(m)) unique.push(m); - if (unique.length >= VISION_MAX_IMAGES) break; - } - - let totalAdded = 0; - let out = text; - let injectedAny = false; - for (const imagePath of unique) { - if (!isSafeTmpPath(imagePath)) { - logger.warn({ imagePath }, 'Vision skipped (path outside TMP_DIR)'); - continue; - } - if (!fs.existsSync(imagePath)) { - logger.warn({ imagePath }, 'Vision skipped (file missing)'); - continue; - } - - try { - logger.info({ imagePath, model: VISION_MODEL }, 'Vision OCR started'); - const ocr = await describeImage(imagePath); - logger.info( - { imagePath, snippet: clampText(ocr, 240) }, - 'Vision OCR raw output (snippet)', - ); - const clipped = clampText(ocr, VISION_MAX_CHARS_PER_IMAGE); - const block = `\n\n[Vision OCR]\n${clipped}\n[/Vision OCR]\n`; - if (totalAdded + block.length > VISION_MAX_TOTAL_CHARS) { - logger.warn( - { imagePath, totalAdded, maxTotal: VISION_MAX_TOTAL_CHARS }, - 'Vision OCR skipped (budget exceeded)', - ); - continue; - } - totalAdded += block.length; - out = out.replace(`[Photo saved: ${imagePath}]`, `[Photo saved: ${imagePath}]${block}`); - injectedAny = true; - logger.info({ imagePath }, 'Vision OCR injected into prompt'); - } catch (err) { - logger.warn({ imagePath, err }, 'Vision OCR failed'); - } - } - - if (!injectedAny) return out; - return ( - '[System note]\n' + - 'One or more images were OCR’d by a vision helper model. Use the content inside\n' + - '`[Vision OCR]...[/Vision OCR]` as authoritative input. 
Do not claim you "cannot see"\n' + - 'the image; answer using the OCR/summary provided.\n' + - '[/System note]\n\n' + - out - ); -} -- 2.45.3