chore: remove Tier-A unused SaaS modules (browser-operator, tts, vision) (Sam & Claude) #7
6 changed files with 0 additions and 805 deletions
|
|
@ -1,115 +0,0 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createClawdieBrowserOperator, type ClawdieBrowserOperator } from './browser-operator.js';
|
||||
import type { BackendCaller } from './browser-orchestrator.js';
|
||||
import type { BrowserCloneRow } from './browser-session-registry.js';
|
||||
|
||||
/** Fixed instant used for the fake row's timestamps and for the injected `now()` clock. */
const baseTime = new Date('2026-05-11T12:00:00.000Z');

/**
 * In-memory stand-in for the database pool used by this suite.
 *
 * It holds exactly one row and recognises two statement shapes by their
 * leading text: a `SELECT session_id::text ...` lookup and an
 * `UPDATE browser_clones ...` write. Any other statement yields no rows.
 */
class OperatorPool {
  row: BrowserCloneRow = {
    session_id: '00000000-0000-4000-8000-000000000150',
    tenant_id: 'tenant-a',
    clone_name: 'browsertask150',
    ip: '192.168.72.150',
    status: 'open',
    credential_mode: 'clean',
    operator_grant_token_jti: null,
    created_at: baseTime,
    updated_at: baseTime,
    opened_at: baseTime,
    closed_at: null,
    last_action_at: null,
    expires_at: null,
    error_code: null,
    error_message: null,
  };

  /**
   * Answers a query against the single fake row.
   *
   * @param sql - Statement text; whitespace is collapsed before prefix matching.
   * @param params - Positional parameters of the statement.
   * @returns The row for a matching SELECT, otherwise an empty row list.
   */
  async query(sql: string, params: unknown[] = []): Promise<{ rows: any[] }> {
    // Collapse runs of whitespace so multi-line SQL can be matched by prefix.
    const statement = sql.replace(/\s+/g, ' ').trim();

    if (statement.startsWith('SELECT session_id::text')) {
      const isKnownSession = this.row.session_id === params[0];
      return { rows: isKnownSession ? [this.row] : [] };
    }

    if (statement.startsWith('UPDATE browser_clones')) {
      // Only positions 1, 2, 4 and 5 are consumed; positions 0 and 3 are ignored by this fake.
      const nextStatus = params[1];
      const nextUpdatedAt = params[2];
      const nextClosedAt = params[4];
      const nextLastActionAt = params[5];

      this.row.status = nextStatus as any;
      this.row.updated_at = nextUpdatedAt as Date;
      // Falsy values leave the previously stored timestamp untouched.
      if (nextClosedAt) this.row.closed_at = nextClosedAt as Date;
      if (nextLastActionAt) this.row.last_action_at = nextLastActionAt as Date;
    }

    return { rows: [] };
  }
}
|
||||
|
||||
describe('ClawdieBrowserOperator', () => {
  // Session id of the single row held by OperatorPool.
  const SESSION_ID = '00000000-0000-4000-8000-000000000150';

  type OperatorDeps = Parameters<typeof createClawdieBrowserOperator>[2];

  /** Builds an operator for the known session, backed by `pool` (a fresh fake by default). */
  const operatorFor = (deps: OperatorDeps, pool: OperatorPool = new OperatorPool()): ClawdieBrowserOperator =>
    createClawdieBrowserOperator(pool as any, SESSION_ID, deps);

  it('exports the UI-TARS-compatible screenshot/execute shape', () => {
    const operator: ClawdieBrowserOperator = createClawdieBrowserOperator(new OperatorPool() as any, 's1');

    for (const method of ['screenshot', 'execute', 'finished', 'close'] as const) {
      expect(typeof operator[method]).toBe('function');
    }
  });

  it('adapts screenshot responses to imageBase64', async () => {
    // Backend answers with the snake_case payload that the operator must adapt.
    const backend: BackendCaller = vi.fn(async () => ({
      ok: true,
      status: 200,
      data: { ok: true, image_base64: 'iVBORw0=', width: 1024, height: 768 },
    }));
    const operator = operatorFor({ backend, now: () => baseTime });

    const shot = operator.screenshot();

    await expect(shot).resolves.toEqual({
      imageBase64: 'iVBORw0=',
      width: 1024,
      height: 768,
    });
    expect(backend).toHaveBeenCalledWith('192.168.72.150', 'POST', '/screenshot', {}, 30000);
  });

  it('translates basic predictions to browser action helpers', async () => {
    // Echo the request path and body back so each translation can be asserted.
    const backend: BackendCaller = vi.fn(async (_ip, _method, path, body) => ({
      ok: true,
      status: 200,
      data: { ok: true, path, body },
    }));
    const operator = operatorFor({ backend, now: () => baseTime });

    await expect(operator.execute({ action: 'navigate', url: 'https://example.com/' })).resolves.toMatchObject({
      path: '/navigate',
      body: { url: 'https://example.com/' },
    });
    await expect(operator.execute({ action: 'click', selector: '#go' })).resolves.toMatchObject({
      path: '/click',
      body: { selector: '#go' },
    });
    await expect(operator.execute({ action: 'type', text: 'hello' })).resolves.toMatchObject({
      path: '/type',
      body: { text: 'hello' },
    });
  });

  it('tracks finish and closes the browser session', async () => {
    const pool = new OperatorPool();
    const hostd = vi.fn(async () => ({ id: 'x', ok: true, output: 'ok' }));
    const operator = operatorFor({ hostd, now: () => baseTime }, pool);

    await expect(operator.execute({ action: 'finish' })).resolves.toEqual({ ok: true, finished: true });
    expect(await operator.finished?.()).toBe(true);

    await operator.close?.();

    expect(hostd).toHaveBeenCalledWith('browser-clone-reap', {
      clone: 'browsertask150',
      ip: '192.168.72.150',
      suffix: 'bt150',
    });
    expect(pool.row.status).toBe('closed');
  });
});
|
||||
|
|
@ -1,161 +0,0 @@
|
|||
import type pg from 'pg';
|
||||
|
||||
import {
|
||||
clickBrowserSession,
|
||||
closeBrowserSession,
|
||||
navigateBrowserSession,
|
||||
readDomBrowserSession,
|
||||
screenshotBrowserSession,
|
||||
scrollBrowserSession,
|
||||
typeBrowserSession,
|
||||
type BrowserOrchestratorDeps,
|
||||
} from './browser-orchestrator.js';
|
||||
|
||||
/**
 * Screenshot as handed to operator consumers.
 *
 * `imageBase64` is always present; the dimensions are optional.
 */
export interface ClawdieBrowserScreenshot {
  /** Base64-encoded image data. */
  imageBase64: string;
  /** Image width, when reported. */
  width?: number;
  /** Image height, when reported. */
  height?: number;
}
|
||||
|
||||
/**
 * Operator contract: capture a screenshot, execute one prediction, and
 * optionally report completion and release the session.
 */
export interface ClawdieBrowserOperator {
  /** Captures the current browser view. */
  screenshot(): Promise<ClawdieBrowserScreenshot>;

  /** Executes a single prediction; the accepted shape is defined by the implementation. */
  execute(prediction: unknown): Promise<unknown>;

  /** Optional: whether the operator considers its task finished. */
  finished?(): Promise<boolean> | boolean;

  /** Optional: releases the underlying session. */
  close?(): Promise<void>;
}
|
||||
|
||||
/**
 * Loose shape of a prediction accepted by the operator. Every field is
 * optional; which ones are read depends on the resolved action name.
 */
interface Prediction {
  /** Action name; takes precedence over `type` when both are set. */
  action?: string;
  /** Fallback action name used when `action` is absent. */
  type?: string;
  /** Target URL for navigation actions. */
  url?: string;
  /** Click coordinates, used when no selector is given. */
  x?: number;
  y?: number;
  /** Element selector for click, type and scroll actions. */
  selector?: string;
  /** Text for type actions. */
  text?: string;
  /** Scroll deltas. */
  dx?: number;
  dy?: number;
  /** Screenshot option forwarded as-is. */
  full_page?: boolean;
  /** DOM read option forwarded as-is. */
  max_bytes?: number;
}
|
||||
|
||||
/**
 * Resolves the action name of a prediction.
 *
 * Prefers `action`, falls back to `type`, and normalises the result by
 * trimming and lower-casing it. Returns an empty string when neither is set.
 */
function predictionAction(prediction: Prediction): string {
  const rawName = prediction.action ?? prediction.type ?? '';
  const asText = String(rawName);
  return asText.trim().toLowerCase();
}
|
||||
|
||||
/**
 * Validates that a raw prediction is a plain, non-null object and returns it
 * typed as `Prediction`.
 *
 * Arrays are rejected as well: `typeof [] === 'object'`, so without the
 * explicit check an array would be cast to `Prediction` and only fail later
 * with a misleading "unsupported action" error.
 *
 * @param prediction - Untrusted value handed to the operator.
 * @returns The same reference, typed as `Prediction`.
 * @throws Error when the value is null/undefined, a primitive, or an array.
 */
function ensureObject(prediction: unknown): Prediction {
  const isObjectLike = prediction !== null && typeof prediction === 'object';
  if (!isObjectLike || Array.isArray(prediction)) {
    throw new Error('browser operator prediction must be an object');
  }
  return prediction as Prediction;
}
|
||||
|
||||
/**
 * Converts a result object into a plain value or a thrown error.
 *
 * @param result - Either `{ ok: true, value }` or `{ ok: false, code, error }`.
 * @returns `value` for a successful result.
 * @throws Error whose message is `error` and which carries `code` as an extra
 * property, for a failed result.
 */
function unwrap<T>(result: { ok: true; value: T } | { ok: false; code: string; error: string }): T {
  if (result.ok) {
    return result.value;
  }
  const failure = new Error(result.error);
  throw Object.assign(failure, { code: result.code });
}
|
||||
|
||||
/**
 * `ClawdieBrowserOperator` implementation that forwards every call to the
 * browser-orchestrator functions for one session.
 *
 * Orchestrator results are passed through `unwrap`, so a failed result
 * surfaces as a thrown `Error` carrying a `code` property. The result of
 * `closeBrowserSession` is the exception: it is awaited but not unwrapped.
 */
export class BrowserSessionOperator implements ClawdieBrowserOperator {
  /** Set once a finish/done action was executed or the session was closed. */
  private isDone = false;

  constructor(
    private readonly pool: pg.Pool,
    private readonly sessionId: string,
    private readonly deps: BrowserOrchestratorDeps = {},
  ) {}

  /**
   * Takes a screenshot with default options and maps the snake_case
   * `image_base64` field to `imageBase64` (empty string when absent).
   */
  async screenshot(): Promise<ClawdieBrowserScreenshot> {
    const payload = unwrap(await screenshotBrowserSession(this.pool, this.sessionId, {}, this.deps)) as {
      image_base64?: string;
      width?: number;
      height?: number;
    };
    const imageBase64 = payload.image_base64 ?? '';
    return { imageBase64, width: payload.width, height: payload.height };
  }

  /**
   * Executes one prediction by dispatching on its normalised action name.
   *
   * Supported names: `navigate`/`open`, `click`, `type`/`input`, `scroll`,
   * `read_dom`/`read`, `screenshot`, and `finish`/`done`.
   *
   * @throws Error when the prediction is not an object, when a navigate action
   * has no `url`, when the action is unsupported, or when the orchestrator
   * reports a failure.
   */
  async execute(rawPrediction: unknown): Promise<unknown> {
    const prediction = ensureObject(rawPrediction);
    const action = predictionAction(prediction);
    const { pool, sessionId, deps } = this;

    switch (action) {
      case 'navigate':
      case 'open': {
        if (!prediction.url) throw new Error('navigate action requires url');
        return unwrap(await navigateBrowserSession(pool, sessionId, { url: prediction.url }, deps));
      }

      case 'click': {
        // A selector wins over coordinates when both are supplied.
        return unwrap(
          await clickBrowserSession(
            pool,
            sessionId,
            prediction.selector ? { selector: prediction.selector } : { x: prediction.x, y: prediction.y },
            deps,
          ),
        );
      }

      case 'type':
      case 'input': {
        return unwrap(
          await typeBrowserSession(
            pool,
            sessionId,
            { text: prediction.text ?? '', selector: prediction.selector },
            deps,
          ),
        );
      }

      case 'scroll': {
        return unwrap(
          await scrollBrowserSession(
            pool,
            sessionId,
            { dx: prediction.dx, dy: prediction.dy, selector: prediction.selector },
            deps,
          ),
        );
      }

      case 'read_dom':
      case 'read': {
        return unwrap(await readDomBrowserSession(pool, sessionId, { max_bytes: prediction.max_bytes }, deps));
      }

      case 'screenshot': {
        // Unlike screenshot(), this returns the orchestrator payload unmapped.
        return unwrap(await screenshotBrowserSession(pool, sessionId, { full_page: prediction.full_page }, deps));
      }

      case 'finish':
      case 'done': {
        // Purely local: marks the operator finished without calling the orchestrator.
        this.isDone = true;
        return { ok: true, finished: true };
      }

      default:
        throw new Error(`unsupported browser operator action: ${action || 'unknown'}`);
    }
  }

  /** Reports whether a finish/done action ran or the session was closed. */
  finished(): boolean {
    return this.isDone;
  }

  /** Closes the session via the orchestrator, then marks the operator finished. */
  async close(): Promise<void> {
    await closeBrowserSession(this.pool, this.sessionId, this.deps);
    this.isDone = true;
  }
}
|
||||
|
||||
/**
 * Factory for the session-backed operator.
 *
 * @param pool - Database pool handed through to the orchestrator calls.
 * @param sessionId - Session the operator acts on.
 * @param deps - Optional orchestrator dependencies; defaults to `{}`.
 * @returns A `BrowserSessionOperator`, typed as the public operator interface.
 */
export function createClawdieBrowserOperator(
  pool: pg.Pool,
  sessionId: string,
  deps: BrowserOrchestratorDeps = {},
): ClawdieBrowserOperator {
  const operator: ClawdieBrowserOperator = new BrowserSessionOperator(pool, sessionId, deps);
  return operator;
}
|
||||
115
src/tts.test.ts
115
src/tts.test.ts
|
|
@ -1,115 +0,0 @@
|
|||
import { describe, it, expect } from 'vitest';
|
||||
|
||||
import { shouldApplyTts, stripTtsMarker, stripMarkdown } from './tts.js';
|
||||
|
||||
describe('shouldApplyTts', () => {
  type Options = Parameters<typeof shouldApplyTts>[0];

  // One row per test: the test name, the options passed in, and the expected decision.
  const cases: Array<{ name: string; opts: Options; expected: boolean }> = [
    {
      name: 'returns false for off mode',
      opts: { mode: 'off', hadInboundAudio: false, text: 'hello' },
      expected: false,
    },
    {
      name: 'returns true for always mode',
      opts: { mode: 'always', hadInboundAudio: false, text: 'hello' },
      expected: true,
    },
    {
      name: 'returns true for inbound mode when inbound was audio',
      opts: { mode: 'inbound', hadInboundAudio: true, text: 'hello' },
      expected: true,
    },
    {
      name: 'returns false for inbound mode when inbound was not audio',
      opts: { mode: 'inbound', hadInboundAudio: false, text: 'hello' },
      expected: false,
    },
    {
      name: 'returns true for tagged mode when [[tts]] is present',
      opts: { mode: 'tagged', hadInboundAudio: false, text: 'hello [[tts]] world' },
      expected: true,
    },
    {
      name: 'returns false for tagged mode without marker',
      opts: { mode: 'tagged', hadInboundAudio: false, text: 'hello world' },
      expected: false,
    },
    {
      name: '[[tts]] is case-insensitive',
      opts: { mode: 'tagged', hadInboundAudio: false, text: '[[TTS]]' },
      expected: true,
    },
  ];

  for (const { name, opts, expected } of cases) {
    it(name, () => {
      expect(shouldApplyTts(opts)).toBe(expected);
    });
  }
});
|
||||
|
||||
describe('stripTtsMarker', () => {
  // [test name, input, expected output]
  const cases: Array<[name: string, input: string, expected: string]> = [
    ['removes [[tts]] from text', 'hello [[tts]] world', 'hello world'],
    ['returns text unchanged when no marker', 'hello world', 'hello world'],
    ['removes multiple markers', '[[tts]] a [[tts]] b', 'a b'],
    ['is case-insensitive', '[[TTS]]hello', 'hello'],
  ];

  for (const [name, input, expected] of cases) {
    it(name, () => {
      expect(stripTtsMarker(input)).toBe(expected);
    });
  }
});
|
||||
|
||||
describe('stripMarkdown', () => {
  // [test name, markdown input, expected plain text]
  const cases: Array<[name: string, input: string, expected: string]> = [
    ['removes bold markers', '**hello**', 'hello'],
    ['removes italic markers', '*hello*', 'hello'],
    ['removes code fences', '```js\nconsole.log("hi")\n```', ''],
    ['removes inline code', 'use `foo` here', 'use foo here'],
    ['removes links keeping text', '[click](https://example.com)', 'click'],
    ['removes heading markers', '## Title', 'Title'],
    ['removes list markers', '- item', 'item'],
    ['removes strikethrough', '~~deleted~~', 'deleted'],
  ];

  for (const [name, input, expected] of cases) {
    it(name, () => {
      expect(stripMarkdown(input)).toBe(expected);
    });
  }
});
|
||||
171
src/tts.ts
171
src/tts.ts
|
|
@ -1,171 +0,0 @@
|
|||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { spawn } from 'child_process';
|
||||
|
||||
import { logger } from './logger.js';
|
||||
import {
|
||||
TTS_PROVIDER,
|
||||
TTS_MAX_TEXT_LENGTH,
|
||||
TTS_OUTPUT_FORMAT,
|
||||
TTS_VOICE,
|
||||
TMP_DIR,
|
||||
} from './config.js';
|
||||
|
||||
/**
 * Policy deciding when a reply is spoken:
 * `off` never, `always` every time, `inbound` only when the inbound message
 * was audio, `tagged` only when the text carries a `[[tts]]` marker.
 */
export type TtsAutoMode = 'off' | 'always' | 'inbound' | 'tagged';

/** Outcome of a successful synthesis. */
interface TtsResult {
  /** Path of the audio file that was written. */
  audioPath: string;
}
|
||||
|
||||
/**
 * Reduces markdown-flavoured text to plain text suitable for speech.
 *
 * Removes, in this order: `[Vision OCR]` and `[System note]` blocks, fenced
 * code blocks (including their content), inline-code backticks, bold /
 * italic / strikethrough markers, image syntax (keeping the alt text), link
 * syntax (keeping the link text), heading, list and blockquote prefixes, and
 * runs of three or more dashes. Finally collapses 3+ newlines to two and trims.
 *
 * Images are handled before links. Otherwise `![alt](url)` would leave a
 * stray `!`, and `![](url)` would stay verbatim so the URL would be spoken.
 *
 * @param text - Raw text, possibly containing markdown.
 * @returns The text with the markup listed above removed.
 */
export function stripMarkdown(text: string): string {
  let out = text;

  // Helper-injected blocks are not meant to be read aloud.
  out = out.replace(/\[Vision OCR\][\s\S]*?\[\/Vision OCR\]/g, '');
  out = out.replace(/\[System note\][\s\S]*?\[\/System note\]/g, '');

  // Code: fenced blocks vanish entirely, inline code keeps its content.
  out = out.replace(/```[\s\S]*?```/g, '');
  out = out.replace(/`([^`]+)`/g, '$1');

  // Emphasis: the double-character markers must be stripped before the single ones.
  out = out.replace(/\*\*([^*]+)\*\*/g, '$1');
  out = out.replace(/\*([^*]+)\*/g, '$1');
  out = out.replace(/__([^_]+)__/g, '$1');
  out = out.replace(/_([^_]+)_/g, '$1');
  out = out.replace(/~~([^~]+)~~/g, '$1');

  // Images first (alt text may be empty), then ordinary links.
  out = out.replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1');
  out = out.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');

  // Line-level prefixes.
  out = out.replace(/^#{1,6}\s+/gm, '');
  out = out.replace(/^[-*+]\s+/gm, '');
  out = out.replace(/^\d+\.\s+/gm, '');
  out = out.replace(/^>\s+/gm, '');

  out = out.replace(/---+/g, '');
  out = out.replace(/\n{3,}/g, '\n\n');

  return out.trim();
}
|
||||
|
||||
/**
 * Decides whether a reply should be synthesized to speech.
 *
 * @param opts.mode - Policy: `off` never, `always` every time, `inbound` only
 * when the inbound message was audio, `tagged` only when the text contains a
 * `[[tts]]` marker (case-insensitive).
 * @param opts.hadInboundAudio - Whether the inbound message was audio.
 * @param opts.text - Reply text, inspected only in `tagged` mode.
 * @returns `true` when speech should be produced; `false` for any unknown mode.
 */
export function shouldApplyTts(opts: {
  mode: TtsAutoMode;
  hadInboundAudio: boolean;
  text: string;
}): boolean {
  switch (opts.mode) {
    case 'off':
      return false;
    case 'always':
      return true;
    case 'inbound':
      return opts.hadInboundAudio;
    case 'tagged':
      return /\[\[tts\]\]/i.test(opts.text);
    default:
      // Reached only if a value outside the declared union slips in at runtime.
      return false;
  }
}
|
||||
|
||||
/**
 * Removes every `[[tts]]` marker (case-insensitive) from the text and trims
 * the result. Whitespace that surrounded a marker inside the text is kept.
 */
export function stripTtsMarker(text: string): string {
  const withoutMarkers = text.replace(/\[\[tts\]\]/gi, '');
  return withoutMarkers.trim();
}
|
||||
|
||||
/**
 * Synthesizes speech for `text` into a new file under `TMP_DIR/tts`.
 *
 * The text is stripped of markdown and truncated to `TTS_MAX_TEXT_LENGTH`
 * characters. After synthesis the output must exist, be a regular file and
 * be at least 1024 bytes.
 *
 * Validation runs inside the same `try` as the synthesis, so a missing,
 * non-file or undersized output is removed before the error propagates
 * instead of being left behind in the temp directory.
 *
 * @param text - Text to speak; may contain markdown.
 * @param opts.voice - Voice name; falls back to `TTS_VOICE` when empty.
 * @param opts.outputFormat - Format hint used for the file extension and
 * logging; falls back to `TTS_OUTPUT_FORMAT` when empty.
 * @returns The path of the synthesized audio file.
 * @throws Error when nothing is left to speak, the provider is `azure` (not
 * implemented), the CLI fails, or the output fails validation.
 */
export async function synthesize(
  text: string,
  opts?: {
    voice?: string;
    outputFormat?: string;
  },
): Promise<TtsResult> {
  const voice = opts?.voice || TTS_VOICE;
  const outputFormat = opts?.outputFormat || TTS_OUTPUT_FORMAT;

  const cleanText = stripMarkdown(text);
  if (!cleanText) {
    throw new Error('TTS: no text to synthesize after stripping markdown');
  }

  const truncated =
    cleanText.length > TTS_MAX_TEXT_LENGTH
      ? cleanText.slice(0, TTS_MAX_TEXT_LENGTH)
      : cleanText;

  const ttsDir = path.join(TMP_DIR, 'tts');
  fs.mkdirSync(ttsDir, { recursive: true });

  // Timestamp plus a short random suffix keeps concurrent outputs apart.
  const ext = inferExtension(outputFormat);
  const fileName = `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}${ext}`;
  const audioPath = path.join(ttsDir, fileName);

  let bytes = 0;
  try {
    if (TTS_PROVIDER === 'azure') {
      throw new Error('TTS: azure provider not implemented yet');
    }
    await runEdgeTtsCli({
      text: truncated,
      voice,
      audioPath,
    });

    if (!fs.existsSync(audioPath)) {
      throw new Error('TTS: edge-tts completed but no audio file produced');
    }
    const stat = fs.statSync(audioPath);
    if (!stat.isFile()) {
      throw new Error('TTS: synthesized audio path is not a file');
    }
    if (stat.size < 1024) {
      throw new Error(`TTS: synthesized audio file too small (${stat.size} bytes)`);
    }
    bytes = stat.size;
  } catch (err) {
    // Best-effort cleanup of partial or invalid output; the original error is what matters.
    try {
      fs.unlinkSync(audioPath);
    } catch {
      // Nothing to remove, or removal failed; ignore either way.
    }
    throw err;
  }

  logger.info(
    {
      voice,
      format: outputFormat,
      chars: truncated.length,
      bytes,
      path: audioPath,
    },
    'TTS synthesized',
  );

  return { audioPath };
}
|
||||
|
||||
/**
 * Runs the `edge-tts` executable from the `bin` directory one level above
 * this module's directory and resolves once it exits with code 0.
 *
 * The module path is percent-decoded before use. `URL#pathname` stays
 * URL-encoded, so an install path containing spaces or non-ASCII characters
 * would otherwise point at a non-existent binary and be misreported as
 * "edge-tts not found".
 *
 * @param opts.text - Text passed via `--text`.
 * @param opts.voice - Voice passed via `--voice`.
 * @param opts.audioPath - Output file passed via `--write-media`.
 * @throws Error (as a rejection) when the binary is missing, cannot be
 * spawned, exits with a non-zero code, or is terminated by a signal.
 */
function runEdgeTtsCli(opts: {
  text: string;
  voice: string;
  audioPath: string;
}): Promise<void> {
  return new Promise((resolve, reject) => {
    const args = [
      '--text',
      opts.text,
      '--voice',
      opts.voice,
      '--write-media',
      opts.audioPath,
    ];

    const moduleFile = decodeURIComponent(new URL(import.meta.url).pathname);
    const binPath = path.join(
      path.dirname(path.dirname(moduleFile)),
      'bin',
      'edge-tts',
    );

    // Arguments are passed as an array with no shell, so the text is never shell-interpreted.
    const proc = spawn(binPath, args, { stdio: 'ignore' });

    proc.on('error', (err: NodeJS.ErrnoException) => {
      if (err && err.code === 'ENOENT') {
        reject(
          new Error(
            'TTS: edge-tts not found. Install with: pip install edge-tts',
          ),
        );
        return;
      }
      reject(err);
    });

    proc.on('exit', (code, signal) => {
      if (code === 0) {
        resolve();
      } else if (signal) {
        // `code` is null when the process was killed; name the signal instead.
        reject(new Error(`TTS: edge-tts terminated by signal ${signal}`));
      } else {
        reject(new Error(`TTS: edge-tts exited with code ${code}`));
      }
    });
  });
}
|
||||
|
||||
/**
 * Picks a file extension from an output-format name by case-insensitive
 * substring match, checked in this order: ogg/opus, webm, wav/riff/pcm.
 * Anything else maps to `.mp3`.
 */
function inferExtension(outputFormat: string): string {
  const format = outputFormat.toLowerCase();
  const mentions = (...needles: string[]): boolean =>
    needles.some((needle) => format.includes(needle));

  if (mentions('ogg', 'opus')) return '.ogg';
  if (mentions('webm')) return '.webm';
  if (mentions('wav', 'riff', 'pcm')) return '.wav';
  return '.mp3';
}
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { TMP_TESTS_DIR } from './config.js';
|
||||
|
||||
/**
 * Writes a four-byte file made of the JPEG start-of-image and end-of-image
 * markers. That is enough for base64 packaging; a real model may ignore it.
 */
function writeTinyJpeg(filePath: string): void {
  const startOfImage = [0xff, 0xd8];
  const endOfImage = [0xff, 0xd9];
  const bytes = Buffer.from([...startOfImage, ...endOfImage]);
  fs.writeFileSync(filePath, bytes);
}
|
||||
|
||||
describe('augmentPromptWithVision', () => {
  // Snapshot of the environment, restored after every test.
  const originalEnv = { ...process.env };

  afterEach(() => {
    process.env = { ...originalEnv };
    vi.restoreAllMocks();
    vi.resetModules();
  });

  it('injects [Vision OCR] block for saved Telegram photos under TMP_DIR', async () => {
    // Arrange: a tiny image on disk plus an OpenRouter-shaped configuration.
    const dir = fs.mkdtempSync(path.join(TMP_TESTS_DIR, 'vision-test-'));
    const imagePath = path.join(dir, 'photo.jpg');
    writeTinyJpeg(imagePath);

    process.env.VISION_PROVIDER = 'openrouter';
    process.env.VISION_MODEL = 'nvidia/nemotron-nano-12b-v2-vl:free';
    process.env.OPENROUTER_API_KEY = 'test-key';

    const completion = {
      choices: [
        {
          message: { content: 'OCR:\nHELLO\n\nSUMMARY:\n- hi' },
        },
      ],
    };
    const fetchMock = vi.fn(async () => {
      return {
        ok: true,
        json: async () => completion,
      } as any;
    });
    // @ts-expect-error vitest runtime override
    globalThis.fetch = fetchMock;

    // config.ts reads VISION_PROVIDER when the module loads, and the static
    // TMP_TESTS_DIR import above already cached it with an empty value.
    // Reset the module registry so the dynamic import re-reads the env set here.
    vi.resetModules();
    const { augmentPromptWithVision } = await import('./vision.js');

    // Act
    const input = `User: test\n[Photo saved: ${imagePath}]`;
    const out = await augmentPromptWithVision(input);

    // Assert
    expect(fetchMock).toHaveBeenCalled();
    expect(out).toContain('[Vision OCR]');
    expect(out).toContain('OCR:\nHELLO');
    expect(out).toContain('Do not claim you "cannot see"');
  });
});
|
||||
|
||||
183
src/vision.ts
183
src/vision.ts
|
|
@ -1,183 +0,0 @@
|
|||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
import { readEnvFile } from './env.js';
|
||||
import {
|
||||
RUNTIME_ID,
|
||||
TMP_DIR,
|
||||
VISION_MAX_CHARS_PER_IMAGE,
|
||||
VISION_MAX_IMAGES,
|
||||
VISION_MAX_TOTAL_CHARS,
|
||||
VISION_MODEL,
|
||||
VISION_PROVIDER,
|
||||
} from './config.js';
|
||||
import { logger } from './logger.js';
|
||||
|
||||
/**
 * The subset of the chat-completions response body that this module reads.
 * Every level is optional, so a malformed response degrades to "no content".
 */
type OpenRouterResponse = {
  choices?: {
    message?: {
      content?: string;
    };
  }[];
};
|
||||
|
||||
/**
 * Memoised API key: `undefined` means "not looked up yet", `null` means
 * "looked up and not configured".
 */
let cachedOpenRouterKey: string | null | undefined;

/**
 * Returns the OpenRouter API key, looking it up at most once.
 *
 * `process.env.OPENROUTER_API_KEY` wins; otherwise the key is read through
 * `readEnvFile`. The outcome, including "not configured", is cached, so later
 * changes to the environment are not picked up.
 */
function getOpenRouterKey(): string | null {
  if (cachedOpenRouterKey === undefined) {
    const envKey = process.env.OPENROUTER_API_KEY;
    cachedOpenRouterKey = envKey
      ? envKey
      : (readEnvFile(['OPENROUTER_API_KEY']).OPENROUTER_API_KEY || null);
  }
  return cachedOpenRouterKey;
}
|
||||
|
||||
/**
 * Limits `text` to `maxChars` characters. Longer text is cut and a truncation
 * marker is appended on its own line; shorter text is returned untouched.
 */
function clampText(text: string, maxChars: number): string {
  const fits = text.length <= maxChars;
  if (fits) {
    return text;
  }
  const head = text.slice(0, maxChars);
  return `${head}\n…(truncated)…`;
}
|
||||
|
||||
/**
 * Checks that `filePath` lies strictly inside `TMP_DIR`.
 *
 * The path is extracted from prompt text by `augmentPromptWithVision`, so it
 * is treated as untrusted. Two checks are applied:
 *
 * 1. Lexical: the resolved path (with `..` segments collapsed) must start
 *    with the resolved `TMP_DIR` plus a separator.
 * 2. Physical: when both paths can be canonicalised, the real path of the
 *    file must also lie under the real path of `TMP_DIR`. This rejects a
 *    symlink placed inside `TMP_DIR` that points elsewhere.
 *
 * If canonicalisation fails (for example because the file does not exist),
 * the lexical verdict stands, so the caller can still report a missing file
 * as such.
 *
 * @param filePath - Candidate path, absolute or relative to the working directory.
 * @returns `true` when the path is considered inside `TMP_DIR`.
 */
function isSafeTmpPath(filePath: string): boolean {
  const resolved = path.resolve(filePath);
  const safeRoot = path.resolve(TMP_DIR) + path.sep;
  if (!resolved.startsWith(safeRoot)) return false;

  try {
    // Canonicalise both sides so a symlinked TMP_DIR still compares correctly.
    const realRoot = fs.realpathSync(TMP_DIR) + path.sep;
    const realFile = fs.realpathSync(resolved);
    return realFile.startsWith(realRoot);
  } catch {
    // Missing file or unreadable path: keep the lexical verdict.
    return true;
  }
}
|
||||
|
||||
/**
 * Sends one image to the OpenRouter chat-completions endpoint and returns the
 * model's OCR-plus-summary text.
 *
 * The image is read asynchronously, so a large file does not block the event
 * loop, and is embedded as a base64 `data:` URL. The MIME subtype is taken
 * from the file extension (`jpg` becomes `jpeg`, a missing extension becomes
 * `png`).
 *
 * @param imagePath - Path of the image file to describe.
 * @returns The trimmed message content of the first choice.
 * @throws Error when no API key is configured, the file cannot be read, the
 * HTTP response is not OK, or the response carries no content.
 */
async function describeImageOpenRouter(imagePath: string): Promise<string> {
  const key = getOpenRouterKey();
  if (!key) throw new Error('OPENROUTER_API_KEY not configured');

  const resolved = path.resolve(imagePath);
  const bytes = await fs.promises.readFile(resolved);
  const base64 = bytes.toString('base64');
  const ext = path.extname(resolved).toLowerCase().replace('.', '') || 'png';
  const mime = ext === 'jpg' ? 'jpeg' : ext;
  const dataUrl = `data:image/${mime};base64,${base64}`;

  const prompt =
    'You are an OCR + screenshot/meme helper.\n' +
    '\n' +
    'Task:\n' +
    '1) Extract ALL readable text verbatim (including meme overlay text).\n' +
    '2) If some text is unclear, write [unclear] for that span.\n' +
    '3) After OCR, add a short bulleted summary (max 5 bullets) of what the image shows.\n' +
    '\n' +
    'Output format:\n' +
    'OCR:\n' +
    '<verbatim text>\n' +
    '\n' +
    'SUMMARY:\n' +
    '- ...';

  const body = {
    model: VISION_MODEL,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          { type: 'image_url', image_url: { url: dataUrl } },
        ],
      },
    ],
    max_tokens: 1200,
    // Temperature 0 is requested for the transcription.
    temperature: 0,
  };

  const resp = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${key}`,
      'Content-Type': 'application/json',
      'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI',
      'X-Title': `${RUNTIME_ID}-ai`,
    },
    body: JSON.stringify(body),
  });

  if (!resp.ok) {
    // The body is only diagnostic; failing to read it must not mask the status.
    const txt = await resp.text().catch(() => '');
    throw new Error(`OpenRouter vision failed: ${resp.status} ${txt}`);
  }

  const data = (await resp.json()) as OpenRouterResponse;
  const content = data.choices?.[0]?.message?.content?.trim() || '';
  if (!content) throw new Error('OpenRouter vision returned empty content');
  return content;
}
|
||||
|
||||
/**
 * Describes an image with the configured vision provider.
 *
 * Only `openrouter` is supported; any other `VISION_PROVIDER` value rejects.
 */
async function describeImage(imagePath: string): Promise<string> {
  const providerSupported = VISION_PROVIDER === 'openrouter';
  if (providerSupported) {
    return describeImageOpenRouter(imagePath);
  }
  throw new Error(`Unsupported vision provider: ${VISION_PROVIDER || '(empty)'}`);
}
|
||||
|
||||
/**
 * Appends an OCR/description block after each `[Photo saved: /path]`
 * placeholder and, when at least one block was added, prefixes the prompt
 * with a system note explaining the blocks.
 *
 * Only runs when `VISION_PROVIDER` is set and `VISION_MAX_IMAGES > 0`.
 * At most `VISION_MAX_IMAGES` distinct paths are considered, each block is
 * clipped to `VISION_MAX_CHARS_PER_IMAGE`, and blocks that would push the
 * total past `VISION_MAX_TOTAL_CHARS` are skipped. Paths outside `TMP_DIR`,
 * missing files and provider failures are logged and skipped.
 *
 * The placeholder is located by the exact text that was matched, and the
 * block is inserted through a replacer function. This way a placeholder
 * written with unusual whitespace is still found, and `$&`, `$$` and similar
 * sequences in model-produced OCR text are inserted literally instead of
 * being interpreted as replacement patterns.
 *
 * @param text - Prompt text, possibly containing photo placeholders.
 * @returns The augmented prompt, or the input unchanged when nothing applies.
 */
export async function augmentPromptWithVision(text: string): Promise<string> {
  if (!VISION_PROVIDER) return text;
  if (VISION_MAX_IMAGES <= 0) return text;

  const re = /\[Photo saved:\s*([^\]\n]+)\]/g;

  // Distinct image path -> the placeholder text exactly as it first appears.
  const placeholders = new Map<string, string>();
  for (const m of text.matchAll(re)) {
    const candidate = m[1]?.trim();
    if (!candidate) continue;
    if (!placeholders.has(candidate)) placeholders.set(candidate, m[0]);
    if (placeholders.size >= VISION_MAX_IMAGES) break;
  }
  if (placeholders.size === 0) return text;

  let totalAdded = 0;
  let out = text;
  let injectedAny = false;

  for (const [imagePath, placeholder] of placeholders) {
    if (!isSafeTmpPath(imagePath)) {
      logger.warn({ imagePath }, 'Vision skipped (path outside TMP_DIR)');
      continue;
    }
    if (!fs.existsSync(imagePath)) {
      logger.warn({ imagePath }, 'Vision skipped (file missing)');
      continue;
    }

    try {
      logger.info({ imagePath, model: VISION_MODEL }, 'Vision OCR started');
      const ocr = await describeImage(imagePath);
      logger.info(
        { imagePath, snippet: clampText(ocr, 240) },
        'Vision OCR raw output (snippet)',
      );

      const clipped = clampText(ocr, VISION_MAX_CHARS_PER_IMAGE);
      const block = `\n\n[Vision OCR]\n${clipped}\n[/Vision OCR]\n`;
      if (totalAdded + block.length > VISION_MAX_TOTAL_CHARS) {
        logger.warn(
          { imagePath, totalAdded, maxTotal: VISION_MAX_TOTAL_CHARS },
          'Vision OCR skipped (budget exceeded)',
        );
        continue;
      }

      totalAdded += block.length;
      // A replacer function keeps `$`-sequences in the OCR text literal.
      out = out.replace(placeholder, () => `${placeholder}${block}`);
      injectedAny = true;
      logger.info({ imagePath }, 'Vision OCR injected into prompt');
    } catch (err) {
      // One failing image must not prevent the remaining ones from being processed.
      logger.warn({ imagePath, err }, 'Vision OCR failed');
    }
  }

  if (!injectedAny) return out;

  return (
    '[System note]\n' +
    'One or more images were OCR’d by a vision helper model. Use the content inside\n' +
    '`[Vision OCR]...[/Vision OCR]` as authoritative input. Do not claim you "cannot see"\n' +
    'the image; answer using the OCR/summary provided.\n' +
    '[/System note]\n\n' +
    out
  );
}
|
||||
Loading…
Add table
Reference in a new issue