chore: remove Tier-A unused SaaS modules (browser-operator, tts, vision) (Sam & Claude) #7
6 changed files with 0 additions and 805 deletions
|
|
@ -1,115 +0,0 @@
|
||||||
import { describe, expect, it, vi } from 'vitest';
|
|
||||||
|
|
||||||
import { createClawdieBrowserOperator, type ClawdieBrowserOperator } from './browser-operator.js';
|
|
||||||
import type { BackendCaller } from './browser-orchestrator.js';
|
|
||||||
import type { BrowserCloneRow } from './browser-session-registry.js';
|
|
||||||
|
|
||||||
const baseTime = new Date('2026-05-11T12:00:00.000Z');
|
|
||||||
|
|
||||||
/**
 * In-memory stand-in for a `pg.Pool` that holds exactly one browser-clone row.
 *
 * Only two statement shapes are recognised (by their normalised prefix):
 * a `SELECT session_id::text ...` lookup and an `UPDATE browser_clones ...`.
 * Every other statement resolves to an empty row set.
 */
class OperatorPool {
  /** The single row served by the fake pool; tests read it back after updates. */
  row: BrowserCloneRow = {
    session_id: '00000000-0000-4000-8000-000000000150',
    tenant_id: 'tenant-a',
    clone_name: 'browsertask150',
    ip: '192.168.72.150',
    status: 'open',
    credential_mode: 'clean',
    operator_grant_token_jti: null,
    created_at: baseTime,
    updated_at: baseTime,
    opened_at: baseTime,
    closed_at: null,
    last_action_at: null,
    expires_at: null,
    error_code: null,
    error_message: null,
  };

  /** Emulates `pool.query` for the two statements the code under test issues. */
  async query(sql: string, params: unknown[] = []): Promise<{ rows: any[] }> {
    // Collapse whitespace so multi-line SQL can be matched by prefix.
    const statement = sql.trim().replace(/\s+/g, ' ');

    if (statement.startsWith('SELECT session_id::text')) {
      const found = params[0] === this.row.session_id;
      return { rows: found ? [this.row] : [] };
    }

    if (statement.startsWith('UPDATE browser_clones')) {
      this.applyUpdate(params);
    }
    return { rows: [] };
  }

  /** Copies the positional UPDATE parameters onto the stored row. */
  private applyUpdate(params: unknown[]): void {
    // Positions 0 and 3 are intentionally ignored, mirroring the statement's parameter order.
    const status = params[1];
    const updatedAt = params[2];
    const closedAt = params[4];
    const lastActionAt = params[5];

    this.row.status = status as any;
    this.row.updated_at = updatedAt as Date;
    if (closedAt) {
      this.row.closed_at = closedAt as Date;
    }
    if (lastActionAt) {
      this.row.last_action_at = lastActionAt as Date;
    }
  }
}
|
|
||||||
|
|
||||||
// Behavioural tests for the operator facade, driven entirely through injected fakes.
describe('ClawdieBrowserOperator', () => {
  const SESSION_ID = '00000000-0000-4000-8000-000000000150';
  const fixedNow = () => baseTime;

  it('exports the UI-TARS-compatible screenshot/execute shape', () => {
    const operator: ClawdieBrowserOperator = createClawdieBrowserOperator(new OperatorPool() as any, 's1');

    for (const member of ['screenshot', 'execute', 'finished', 'close'] as const) {
      expect(typeof operator[member]).toBe('function');
    }
  });

  it('adapts screenshot responses to imageBase64', async () => {
    const backend: BackendCaller = vi.fn(async () => ({
      ok: true,
      status: 200,
      data: { ok: true, image_base64: 'iVBORw0=', width: 1024, height: 768 },
    }));
    const operator = createClawdieBrowserOperator(new OperatorPool() as any, SESSION_ID, {
      backend,
      now: fixedNow,
    });

    const shot = operator.screenshot();

    await expect(shot).resolves.toEqual({ imageBase64: 'iVBORw0=', width: 1024, height: 768 });
    expect(backend).toHaveBeenCalledWith('192.168.72.150', 'POST', '/screenshot', {}, 30000);
  });

  it('translates basic predictions to browser action helpers', async () => {
    // The fake backend echoes the path and body so the translation is observable.
    const backend: BackendCaller = vi.fn(async (_ip, _method, path, body) => ({
      ok: true,
      status: 200,
      data: { ok: true, path, body },
    }));
    const operator = createClawdieBrowserOperator(new OperatorPool() as any, SESSION_ID, {
      backend,
      now: fixedNow,
    });

    const scenarios = [
      {
        prediction: { action: 'navigate', url: 'https://example.com/' },
        expected: { path: '/navigate', body: { url: 'https://example.com/' } },
      },
      {
        prediction: { action: 'click', selector: '#go' },
        expected: { path: '/click', body: { selector: '#go' } },
      },
      {
        prediction: { action: 'type', text: 'hello' },
        expected: { path: '/type', body: { text: 'hello' } },
      },
    ];

    for (const { prediction, expected } of scenarios) {
      await expect(operator.execute(prediction)).resolves.toMatchObject(expected);
    }
  });

  it('tracks finish and closes the browser session', async () => {
    const pool = new OperatorPool();
    const hostd = vi.fn(async () => ({ id: 'x', ok: true, output: 'ok' }));
    const operator = createClawdieBrowserOperator(pool as any, SESSION_ID, { hostd, now: fixedNow });

    await expect(operator.execute({ action: 'finish' })).resolves.toEqual({ ok: true, finished: true });
    expect(await operator.finished?.()).toBe(true);

    await operator.close?.();

    expect(hostd).toHaveBeenCalledWith('browser-clone-reap', {
      clone: 'browsertask150',
      ip: '192.168.72.150',
      suffix: 'bt150',
    });
    expect(pool.row.status).toBe('closed');
  });
});
|
|
||||||
|
|
@ -1,161 +0,0 @@
|
||||||
import type pg from 'pg';
|
|
||||||
|
|
||||||
import {
|
|
||||||
clickBrowserSession,
|
|
||||||
closeBrowserSession,
|
|
||||||
navigateBrowserSession,
|
|
||||||
readDomBrowserSession,
|
|
||||||
screenshotBrowserSession,
|
|
||||||
scrollBrowserSession,
|
|
||||||
typeBrowserSession,
|
|
||||||
type BrowserOrchestratorDeps,
|
|
||||||
} from './browser-orchestrator.js';
|
|
||||||
|
|
||||||
/** Screenshot payload returned by {@link ClawdieBrowserOperator.screenshot}. */
export interface ClawdieBrowserScreenshot {
  /** Base64-encoded image data. */
  imageBase64: string;
  /** Image width, when the backend reports one. */
  width?: number;
  /** Image height, when the backend reports one. */
  height?: number;
}
|
|
||||||
|
|
||||||
/**
 * Minimal operator contract: capture the screen and execute one predicted action.
 * `finished` and `close` are optional members.
 */
export interface ClawdieBrowserOperator {
  /** Captures the current browser view. */
  screenshot(): Promise<ClawdieBrowserScreenshot>;

  /** Executes one prediction; its shape is validated by the implementation. */
  execute(prediction: unknown): Promise<unknown>;

  /** Reports whether the task is complete; may be synchronous or asynchronous. */
  finished?(): Promise<boolean> | boolean;

  /** Releases the underlying browser session. */
  close?(): Promise<void>;
}
|
|
||||||
|
|
||||||
/**
 * Loose shape of a prediction handed to the operator. Every field is optional;
 * which ones are read depends on the resolved action name.
 */
type Prediction = {
  /** Action name; takes precedence over `type` when both are present. */
  action?: string;
  /** Alternative spelling of the action name. */
  type?: string;

  /** Target URL for navigation. */
  url?: string;

  /** Element selector for click/type/scroll. */
  selector?: string;
  /** Pointer coordinates, used for click when no selector is given. */
  x?: number;
  y?: number;

  /** Text to type. */
  text?: string;

  /** Scroll deltas. */
  dx?: number;
  dy?: number;

  /** Screenshot option forwarded as-is. */
  full_page?: boolean;
  /** DOM-read option forwarded as-is. */
  max_bytes?: number;
};
|
|
||||||
|
|
||||||
/**
 * Resolves the normalised action name of a prediction.
 *
 * `action` wins over `type`; when both are null/undefined the result is the
 * empty string. The name is trimmed and lower-cased.
 */
function predictionAction(prediction: Prediction): string {
  const rawName = prediction.action ?? prediction.type ?? '';
  const asText = String(rawName);
  return asText.trim().toLowerCase();
}
|
|
||||||
|
|
||||||
/**
 * Validates that a raw prediction is a non-null object and returns it typed.
 *
 * @throws Error when the value is falsy or not of type `object`.
 */
function ensureObject(prediction: unknown): Prediction {
  const isObject = Boolean(prediction) && typeof prediction === 'object';
  if (isObject) {
    return prediction as Prediction;
  }
  throw new Error('browser operator prediction must be an object');
}
|
|
||||||
|
|
||||||
/**
 * Converts a result union into a plain value.
 *
 * @returns the `value` of a successful result.
 * @throws Error carrying the failure's `error` text as message and its `code`
 *   attached as an extra property.
 */
function unwrap<T>(result: { ok: true; value: T } | { ok: false; code: string; error: string }): T {
  if (result.ok) {
    return result.value;
  }
  const failure = new Error(result.error);
  throw Object.assign(failure, { code: result.code });
}
|
|
||||||
|
|
||||||
/**
 * {@link ClawdieBrowserOperator} implementation bound to one browser session.
 *
 * Each method delegates to the matching `*BrowserSession` orchestrator helper,
 * passing the pool, session id and dependency overrides given at construction.
 */
export class BrowserSessionOperator implements ClawdieBrowserOperator {
  /** Becomes true after a finish/done prediction or after a successful close. */
  private done = false;

  constructor(
    private readonly pool: pg.Pool,
    private readonly sessionId: string,
    private readonly deps: BrowserOrchestratorDeps = {},
  ) {}

  /** Takes a screenshot and maps the snake_case payload to the camelCase contract. */
  async screenshot(): Promise<ClawdieBrowserScreenshot> {
    const outcome = await screenshotBrowserSession(this.pool, this.sessionId, {}, this.deps);
    const payload = unwrap(outcome) as {
      image_base64?: string;
      width?: number;
      height?: number;
    };
    // A missing image is reported as an empty string rather than undefined.
    const imageBase64 = payload.image_base64 ?? '';
    return { imageBase64, width: payload.width, height: payload.height };
  }

  /**
   * Executes one prediction.
   *
   * Supported action names: navigate/open, click, type/input, scroll,
   * read_dom/read, screenshot, finish/done.
   *
   * @throws Error when the prediction is not an object, when a navigate action
   *   has no url, when the action is unsupported, or when the helper reports failure.
   */
  async execute(rawPrediction: unknown): Promise<unknown> {
    const { pool, sessionId, deps } = this;
    const prediction = ensureObject(rawPrediction);
    const action = predictionAction(prediction);

    if (action === 'navigate' || action === 'open') {
      if (!prediction.url) throw new Error('navigate action requires url');
      const outcome = await navigateBrowserSession(pool, sessionId, { url: prediction.url }, deps);
      return unwrap(outcome);
    }

    if (action === 'click') {
      // A selector takes priority; otherwise the raw coordinates are forwarded.
      const target = prediction.selector
        ? { selector: prediction.selector }
        : { x: prediction.x, y: prediction.y };
      const outcome = await clickBrowserSession(pool, sessionId, target, deps);
      return unwrap(outcome);
    }

    if (action === 'type' || action === 'input') {
      const input = { text: prediction.text ?? '', selector: prediction.selector };
      const outcome = await typeBrowserSession(pool, sessionId, input, deps);
      return unwrap(outcome);
    }

    if (action === 'scroll') {
      const motion = { dx: prediction.dx, dy: prediction.dy, selector: prediction.selector };
      const outcome = await scrollBrowserSession(pool, sessionId, motion, deps);
      return unwrap(outcome);
    }

    if (action === 'read_dom' || action === 'read') {
      const outcome = await readDomBrowserSession(pool, sessionId, { max_bytes: prediction.max_bytes }, deps);
      return unwrap(outcome);
    }

    if (action === 'screenshot') {
      // Unlike screenshot(), the raw helper payload is returned unchanged.
      const outcome = await screenshotBrowserSession(pool, sessionId, { full_page: prediction.full_page }, deps);
      return unwrap(outcome);
    }

    if (action === 'finish' || action === 'done') {
      this.done = true;
      return { ok: true, finished: true };
    }

    throw new Error(`unsupported browser operator action: ${action || 'unknown'}`);
  }

  /** Whether a finish/done action was executed or the session was closed. */
  finished(): boolean {
    return this.done;
  }

  /** Closes the session; the operator is marked done only after the close call resolves. */
  async close(): Promise<void> {
    await closeBrowserSession(this.pool, this.sessionId, this.deps);
    this.done = true;
  }
}
|
|
||||||
|
|
||||||
/**
 * Factory returning a session-bound operator typed as the public interface.
 *
 * @param pool database pool handed to the orchestrator helpers
 * @param sessionId id of the browser session to drive
 * @param deps optional orchestrator dependency overrides (defaults to `{}`)
 */
export function createClawdieBrowserOperator(
  pool: pg.Pool,
  sessionId: string,
  deps: BrowserOrchestratorDeps = {},
): ClawdieBrowserOperator {
  const operator = new BrowserSessionOperator(pool, sessionId, deps);
  return operator;
}
|
|
||||||
115
src/tts.test.ts
115
src/tts.test.ts
|
|
@ -1,115 +0,0 @@
|
||||||
import { describe, it, expect } from 'vitest';
|
|
||||||
|
|
||||||
import { shouldApplyTts, stripTtsMarker, stripMarkdown } from './tts.js';
|
|
||||||
|
|
||||||
// Table-driven coverage of every TTS auto mode.
describe('shouldApplyTts', () => {
  const scenarios = [
    {
      title: 'returns false for off mode',
      input: { mode: 'off', hadInboundAudio: false, text: 'hello' },
      expected: false,
    },
    {
      title: 'returns true for always mode',
      input: { mode: 'always', hadInboundAudio: false, text: 'hello' },
      expected: true,
    },
    {
      title: 'returns true for inbound mode when inbound was audio',
      input: { mode: 'inbound', hadInboundAudio: true, text: 'hello' },
      expected: true,
    },
    {
      title: 'returns false for inbound mode when inbound was not audio',
      input: { mode: 'inbound', hadInboundAudio: false, text: 'hello' },
      expected: false,
    },
    {
      title: 'returns true for tagged mode when [[tts]] is present',
      input: { mode: 'tagged', hadInboundAudio: false, text: 'hello [[tts]] world' },
      expected: true,
    },
    {
      title: 'returns false for tagged mode without marker',
      input: { mode: 'tagged', hadInboundAudio: false, text: 'hello world' },
      expected: false,
    },
    {
      title: '[[tts]] is case-insensitive',
      input: { mode: 'tagged', hadInboundAudio: false, text: '[[TTS]]' },
      expected: true,
    },
  ] as const;

  for (const { title, input, expected } of scenarios) {
    it(title, () => {
      expect(shouldApplyTts(input)).toBe(expected);
    });
  }
});
|
|
||||||
|
|
||||||
// Marker removal: each row is [title, input, expected output].
describe('stripTtsMarker', () => {
  const scenarios: ReadonlyArray<readonly [string, string, string]> = [
    ['removes [[tts]] from text', 'hello [[tts]] world', 'hello world'],
    ['returns text unchanged when no marker', 'hello world', 'hello world'],
    ['removes multiple markers', '[[tts]] a [[tts]] b', 'a b'],
    ['is case-insensitive', '[[TTS]]hello', 'hello'],
  ];

  for (const [title, input, expected] of scenarios) {
    it(title, () => {
      expect(stripTtsMarker(input)).toBe(expected);
    });
  }
});
|
|
||||||
|
|
||||||
// Markdown stripping: each row is [title, markdown input, expected plain text].
describe('stripMarkdown', () => {
  const scenarios: ReadonlyArray<readonly [string, string, string]> = [
    ['removes bold markers', '**hello**', 'hello'],
    ['removes italic markers', '*hello*', 'hello'],
    ['removes code fences', '```js\nconsole.log("hi")\n```', ''],
    ['removes inline code', 'use `foo` here', 'use foo here'],
    ['removes links keeping text', '[click](https://example.com)', 'click'],
    ['removes heading markers', '## Title', 'Title'],
    ['removes list markers', '- item', 'item'],
    ['removes strikethrough', '~~deleted~~', 'deleted'],
  ];

  for (const [title, markdown, expected] of scenarios) {
    it(title, () => {
      expect(stripMarkdown(markdown)).toBe(expected);
    });
  }
});
|
|
||||||
171
src/tts.ts
171
src/tts.ts
|
|
@ -1,171 +0,0 @@
|
||||||
import fs from 'fs';
import path from 'path';
import { spawn } from 'child_process';
import { fileURLToPath } from 'url';
|
|
||||||
|
|
||||||
import { logger } from './logger.js';
|
|
||||||
import {
|
|
||||||
TTS_PROVIDER,
|
|
||||||
TTS_MAX_TEXT_LENGTH,
|
|
||||||
TTS_OUTPUT_FORMAT,
|
|
||||||
TTS_VOICE,
|
|
||||||
TMP_DIR,
|
|
||||||
} from './config.js';
|
|
||||||
|
|
||||||
/** Policy deciding when a reply is converted to speech (see `shouldApplyTts`). */
export type TtsAutoMode =
  | 'always'
  | 'inbound'
  | 'tagged'
  | 'off';
|
|
||||||
|
|
||||||
/** Outcome of a successful synthesis. */
interface TtsResult {
  /** Path of the audio file written by the synthesizer. */
  audioPath: string;
}
|
|
||||||
|
|
||||||
/**
 * Reduces Markdown-flavoured text to plain text suitable for speech synthesis.
 *
 * Removes `[Vision OCR]` / `[System note]` helper blocks and fenced code
 * entirely, unwraps inline code, emphasis, strikethrough and links (keeping the
 * link text), and drops heading, list, ordered-list and blockquote markers as
 * well as horizontal rules.
 *
 * Line-level markers are stripped BEFORE emphasis. Otherwise the single-asterisk
 * italic pattern pairs the `*` bullets of consecutive list items across the
 * newline and mangles the list ("* a\n* b" became "a\n b").
 *
 * @param text raw message text
 * @returns the trimmed plain text; may be empty
 */
export function stripMarkdown(text: string): string {
  let out = text;

  // Whole blocks that must never be spoken.
  out = out.replace(/\[Vision OCR\][\s\S]*?\[\/Vision OCR\]/g, '');
  out = out.replace(/\[System note\][\s\S]*?\[\/System note\]/g, '');
  out = out.replace(/```[\s\S]*?```/g, '');

  // Inline code keeps its content.
  out = out.replace(/`([^`]+)`/g, '$1');

  // Line-level markers (must precede emphasis, see the function comment).
  out = out.replace(/^#{1,6}\s+/gm, '');
  // Leading indentation is allowed so nested bullets are stripped as well.
  out = out.replace(/^[ \t]*[-*+]\s+/gm, '');
  out = out.replace(/^\d+\.\s+/gm, '');
  out = out.replace(/^>\s+/gm, '');

  // Inline emphasis and strikethrough.
  out = out.replace(/\*\*([^*]+)\*\*/g, '$1');
  out = out.replace(/\*([^*]+)\*/g, '$1');
  out = out.replace(/__([^_]+)__/g, '$1');
  out = out.replace(/_([^_]+)_/g, '$1');
  out = out.replace(/~~([^~]+)~~/g, '$1');

  // Links keep only their visible text.
  out = out.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');

  // Horizontal rules, then collapse the blank-line runs left behind.
  out = out.replace(/---+/g, '');
  out = out.replace(/\n{3,}/g, '\n\n');

  return out.trim();
}
|
|
||||||
|
|
||||||
/**
 * Decides whether a reply should be synthesized to speech.
 *
 * - `always`  → always
 * - `inbound` → only when the inbound message carried audio
 * - `tagged`  → only when the text contains a `[[tts]]` marker (any letter case)
 * - `off` or any unrecognised mode → never
 */
export function shouldApplyTts(opts: {
  mode: TtsAutoMode;
  hadInboundAudio: boolean;
  text: string;
}): boolean {
  switch (opts.mode) {
    case 'always':
      return true;
    case 'inbound':
      return opts.hadInboundAudio;
    case 'tagged':
      return /\[\[tts\]\]/i.test(opts.text);
    case 'off':
    default:
      return false;
  }
}
|
|
||||||
|
|
||||||
/**
 * Removes every `[[tts]]` marker (any letter case) from the text.
 *
 * Spaces and tabs surrounding a marker (or a run of adjacent markers) collapse
 * to a single space, so "hello [[tts]] world" becomes "hello world" rather than
 * leaving a double space. A marker with no whitespace on either side is removed
 * without inserting one. The result is trimmed.
 *
 * @param text message text that may contain markers
 * @returns the text without markers
 */
export function stripTtsMarker(text: string): string {
  return text
    .replace(/[ \t]*(?:\[\[tts\]\][ \t]*)+/gi, (match) => (/[ \t]/.test(match) ? ' ' : ''))
    .trim();
}
|
|
||||||
|
|
||||||
/**
 * Synthesizes speech for `text` into a new file under `<TMP_DIR>/tts`.
 *
 * The text is stripped of Markdown and cut to `TTS_MAX_TEXT_LENGTH` characters
 * before synthesis. The produced file is validated (exists, is a regular file,
 * at least 1024 bytes); on ANY failure — provider error or failed validation —
 * the partial output is removed (best effort) before the error is rethrown, so
 * rejected files do not accumulate in the temp directory.
 *
 * @param text raw reply text (may contain Markdown)
 * @param opts.voice voice override; falls back to `TTS_VOICE`
 * @param opts.outputFormat format override; falls back to `TTS_OUTPUT_FORMAT`
 * @returns the path of the synthesized audio file
 * @throws Error when nothing remains after stripping, when the provider is
 *   `azure` (not implemented), when the CLI fails, or when the output is
 *   missing, not a file, or too small
 */
export async function synthesize(
  text: string,
  opts?: {
    voice?: string;
    outputFormat?: string;
  },
): Promise<TtsResult> {
  const voice = opts?.voice || TTS_VOICE;
  // NOTE(review): outputFormat only selects the file extension and is logged;
  // it is not forwarded to runEdgeTtsCli.
  const outputFormat = opts?.outputFormat || TTS_OUTPUT_FORMAT;

  const cleanText = stripMarkdown(text);
  if (!cleanText) {
    throw new Error('TTS: no text to synthesize after stripping markdown');
  }

  const truncated =
    cleanText.length > TTS_MAX_TEXT_LENGTH
      ? cleanText.slice(0, TTS_MAX_TEXT_LENGTH)
      : cleanText;

  const ttsDir = path.join(TMP_DIR, 'tts');
  fs.mkdirSync(ttsDir, { recursive: true });

  const ext = inferExtension(outputFormat);
  // Timestamp plus random suffix keeps concurrent syntheses from colliding.
  const fileName = `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}${ext}`;
  const audioPath = path.join(ttsDir, fileName);

  let bytes: number;
  try {
    if (TTS_PROVIDER === 'azure') {
      throw new Error('TTS: azure provider not implemented yet');
    }
    await runEdgeTtsCli({
      text: truncated,
      voice,
      audioPath,
    });

    // Validation stays inside the try so a rejected file is cleaned up as well.
    if (!fs.existsSync(audioPath)) {
      throw new Error('TTS: edge-tts completed but no audio file produced');
    }
    const stat = fs.statSync(audioPath);
    if (!stat.isFile()) {
      throw new Error('TTS: synthesized audio path is not a file');
    }
    if (stat.size < 1024) {
      throw new Error(`TTS: synthesized audio file too small (${stat.size} bytes)`);
    }
    bytes = stat.size;
  } catch (err) {
    try {
      fs.unlinkSync(audioPath);
    } catch {
      // Best-effort cleanup: the file may never have been created.
    }
    throw err;
  }

  logger.info(
    {
      voice,
      format: outputFormat,
      chars: truncated.length,
      bytes,
      path: audioPath,
    },
    'TTS synthesized',
  );

  return { audioPath };
}
|
|
||||||
|
|
||||||
/**
 * Runs the `edge-tts` executable located in the `bin` directory one level above
 * this module's directory, and resolves once it exits with code 0.
 *
 * The module location is converted with `fileURLToPath`, because the raw
 * `URL.pathname` is percent-encoded and would yield a wrong path whenever the
 * install directory contains spaces or other escaped characters.
 *
 * @param opts.text text to speak (passed via `--text`)
 * @param opts.voice voice name (passed via `--voice`)
 * @param opts.audioPath output file (passed via `--write-media`)
 * @throws Error when the executable cannot be spawned, exits with a non-zero
 *   code, or is terminated by a signal
 */
function runEdgeTtsCli(opts: {
  text: string;
  voice: string;
  audioPath: string;
}): Promise<void> {
  return new Promise((resolve, reject) => {
    const args = [
      '--text',
      opts.text,
      '--voice',
      opts.voice,
      '--write-media',
      opts.audioPath,
    ];
    const moduleFile = fileURLToPath(import.meta.url);
    const binPath = path.join(
      path.dirname(path.dirname(moduleFile)),
      'bin',
      'edge-tts',
    );

    const proc = spawn(binPath, args, { stdio: 'ignore' });

    proc.on('error', (err: Error & { code?: string }) => {
      if (err && err.code === 'ENOENT') {
        reject(
          new Error(
            'TTS: edge-tts not found. Install with: pip install edge-tts',
          ),
        );
        return;
      }
      reject(err);
    });

    proc.on('exit', (code, signal) => {
      if (code === 0) {
        resolve();
      } else if (code === null) {
        // A null exit code means the process was ended by a signal.
        reject(new Error(`TTS: edge-tts terminated by signal ${signal}`));
      } else {
        reject(new Error(`TTS: edge-tts exited with code ${code}`));
      }
    });
  });
}
|
|
||||||
|
|
||||||
/**
 * Maps an output-format name to a file extension by case-insensitive substring
 * match. Rules are checked in order, so a name containing "opus" maps to `.ogg`
 * even when it also contains "webm". Unrecognised names map to `.mp3`.
 */
function inferExtension(outputFormat: string): string {
  const lowered = outputFormat.toLowerCase();
  const rules: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['.ogg', ['ogg', 'opus']],
    ['.webm', ['webm']],
    ['.wav', ['wav', 'riff', 'pcm']],
  ];

  for (const [extension, needles] of rules) {
    if (needles.some((needle) => lowered.includes(needle))) {
      return extension;
    }
  }
  return '.mp3';
}
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
import fs from 'fs';
|
|
||||||
import path from 'path';
|
|
||||||
|
|
||||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
|
||||||
|
|
||||||
import { TMP_TESTS_DIR } from './config.js';
|
|
||||||
|
|
||||||
/**
 * Writes a four-byte JPEG stub (start-of-image marker followed by end-of-image
 * marker). That is enough for base64 packaging; a real model may ignore it.
 */
function writeTinyJpeg(filePath: string): void {
  const stubBytes = Uint8Array.of(0xff, 0xd8, 0xff, 0xd9);
  fs.writeFileSync(filePath, stubBytes);
}
|
|
||||||
|
|
||||||
// End-to-end check of prompt augmentation with a stubbed OpenRouter response.
describe('augmentPromptWithVision', () => {
  const envSnapshot = { ...process.env };

  afterEach(() => {
    // Undo env overrides, mocks and the module cache between tests.
    process.env = { ...envSnapshot };
    vi.restoreAllMocks();
    vi.resetModules();
  });

  it('injects [Vision OCR] block for saved Telegram photos under TMP_DIR', async () => {
    const workDir = fs.mkdtempSync(path.join(TMP_TESTS_DIR, 'vision-test-'));
    const imagePath = path.join(workDir, 'photo.jpg');
    writeTinyJpeg(imagePath);

    Object.assign(process.env, {
      VISION_PROVIDER: 'openrouter',
      VISION_MODEL: 'nvidia/nemotron-nano-12b-v2-vl:free',
      OPENROUTER_API_KEY: 'test-key',
    });

    const cannedResponse = {
      choices: [
        {
          message: { content: 'OCR:\nHELLO\n\nSUMMARY:\n- hi' },
        },
      ],
    };
    const fetchMock = vi.fn(async () => {
      return {
        ok: true,
        json: async () => cannedResponse,
      } as any;
    });
    // @ts-expect-error vitest runtime override
    globalThis.fetch = fetchMock;

    // config.ts evaluates VISION_PROVIDER at module load; the static
    // import of TMP_TESTS_DIR above already cached an empty value.
    // Reset so the dynamic import below re-reads env we just set.
    vi.resetModules();
    const { augmentPromptWithVision } = await import('./vision.js');

    const prompt = `User: test\n[Photo saved: ${imagePath}]`;
    const augmented = await augmentPromptWithVision(prompt);

    expect(fetchMock).toHaveBeenCalled();
    for (const fragment of ['[Vision OCR]', 'OCR:\nHELLO', 'Do not claim you "cannot see"']) {
      expect(augmented).toContain(fragment);
    }
  });
});
|
|
||||||
|
|
||||||
183
src/vision.ts
183
src/vision.ts
|
|
@ -1,183 +0,0 @@
|
||||||
import fs from 'fs';
|
|
||||||
import path from 'path';
|
|
||||||
|
|
||||||
import { readEnvFile } from './env.js';
|
|
||||||
import {
|
|
||||||
RUNTIME_ID,
|
|
||||||
TMP_DIR,
|
|
||||||
VISION_MAX_CHARS_PER_IMAGE,
|
|
||||||
VISION_MAX_IMAGES,
|
|
||||||
VISION_MAX_TOTAL_CHARS,
|
|
||||||
VISION_MODEL,
|
|
||||||
VISION_PROVIDER,
|
|
||||||
} from './config.js';
|
|
||||||
import { logger } from './logger.js';
|
|
||||||
|
|
||||||
/** The subset of the chat-completions response body that this module reads. */
type OpenRouterResponse = {
  choices?: Array<{
    /** Only the text content of the message is consumed. */
    message?: {
      content?: string;
    };
  }>;
};
|
|
||||||
|
|
||||||
// `undefined` means "not looked up yet"; `null` means "looked up, not configured".
let cachedOpenRouterKey: string | null | undefined;

/**
 * Returns the OpenRouter API key, resolving it once and caching the outcome.
 *
 * A non-empty `OPENROUTER_API_KEY` in the process environment wins; otherwise
 * the value read via `readEnvFile` is used. Resolves to `null` when neither
 * source provides a key.
 */
function getOpenRouterKey(): string | null {
  if (cachedOpenRouterKey === undefined) {
    const resolved: string | null =
      process.env.OPENROUTER_API_KEY ||
      readEnvFile(['OPENROUTER_API_KEY']).OPENROUTER_API_KEY ||
      null;
    cachedOpenRouterKey = resolved;
    return resolved;
  }
  return cachedOpenRouterKey;
}
|
|
||||||
|
|
||||||
/**
 * Limits `text` to `maxChars` characters. Text within the limit is returned
 * unchanged; longer text is cut and suffixed with a truncation notice on a new
 * line (the notice is not counted against the limit).
 */
function clampText(text: string, maxChars: number): string {
  const fits = text.length <= maxChars;
  return fits ? text : `${text.slice(0, maxChars)}\n…(truncated)…`;
}
|
|
||||||
|
|
||||||
/**
 * Checks that `filePath`, once resolved to an absolute path, lies strictly
 * inside `TMP_DIR`. The comparison is purely lexical (`path.resolve`); symbolic
 * links are not followed.
 */
function isSafeTmpPath(filePath: string): boolean {
  // The trailing separator prevents sibling directories such as "<TMP_DIR>-other" from matching.
  const rootPrefix = `${path.resolve(TMP_DIR)}${path.sep}`;
  const candidate = path.resolve(filePath);
  return candidate.startsWith(rootPrefix);
}
|
|
||||||
|
|
||||||
/**
 * Sends one image to the OpenRouter chat-completions endpoint and returns the
 * model's trimmed text answer (OCR followed by a short summary).
 *
 * The image is read synchronously and embedded as a base64 data URL whose
 * subtype comes from the file extension (`jpg` → `jpeg`, no extension → `png`).
 *
 * @throws Error when no API key is configured, the HTTP response is not ok, or
 *   the response carries no content
 */
async function describeImageOpenRouter(imagePath: string): Promise<string> {
  const apiKey = getOpenRouterKey();
  if (!apiKey) throw new Error('OPENROUTER_API_KEY not configured');

  const absolutePath = path.resolve(imagePath);
  const encoded = fs.readFileSync(absolutePath).toString('base64');
  const extension = path.extname(absolutePath).toLowerCase().replace('.', '') || 'png';
  const subtype = extension === 'jpg' ? 'jpeg' : extension;
  const dataUrl = `data:image/${subtype};base64,${encoded}`;

  const prompt = [
    'You are an OCR + screenshot/meme helper.',
    '',
    'Task:',
    '1) Extract ALL readable text verbatim (including meme overlay text).',
    '2) If some text is unclear, write [unclear] for that span.',
    '3) After OCR, add a short bulleted summary (max 5 bullets) of what the image shows.',
    '',
    'Output format:',
    'OCR:',
    '<verbatim text>',
    '',
    'SUMMARY:',
    '- ...',
  ].join('\n');

  const payload = {
    model: VISION_MODEL,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          { type: 'image_url', image_url: { url: dataUrl } },
        ],
      },
    ],
    max_tokens: 1200,
    temperature: 0,
  };

  const requestHeaders = {
    Authorization: `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
    'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI',
    'X-Title': `${RUNTIME_ID}-ai`,
  };

  const resp = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: requestHeaders,
    body: JSON.stringify(payload),
  });

  if (!resp.ok) {
    // The error body is best-effort detail; an unreadable body becomes an empty string.
    const detail = await resp.text().catch(() => '');
    throw new Error(`OpenRouter vision failed: ${resp.status} ${detail}`);
  }

  const parsed = (await resp.json()) as OpenRouterResponse;
  const answer = parsed.choices?.[0]?.message?.content?.trim() || '';
  if (!answer) throw new Error('OpenRouter vision returned empty content');
  return answer;
}
|
|
||||||
|
|
||||||
/**
 * Provider dispatch for image description. `openrouter` is the only supported
 * provider; any other configured value is rejected.
 *
 * @throws Error naming the unsupported provider (`(empty)` when unset)
 */
async function describeImage(imagePath: string): Promise<string> {
  if (VISION_PROVIDER === 'openrouter') {
    return describeImageOpenRouter(imagePath);
  }
  throw new Error(`Unsupported vision provider: ${VISION_PROVIDER || '(empty)'}`);
}
|
|
||||||
|
|
||||||
/**
 * Appends an OCR/description block after `[Photo saved: /path]` placeholders.
 * Only runs when `VISION_PROVIDER` is set and `VISION_MAX_IMAGES > 0`.
 *
 * - At most `VISION_MAX_IMAGES` distinct paths are processed, in order of first
 *   appearance; only the first placeholder of each path receives a block.
 * - Paths outside `TMP_DIR` and missing files are skipped with a warning.
 * - Each block is clamped to `VISION_MAX_CHARS_PER_IMAGE`; blocks that would
 *   push the total past `VISION_MAX_TOTAL_CHARS` are skipped.
 * - A failure for one image is logged and does not abort the others.
 * - When at least one block is injected, a `[System note]` preamble is prepended.
 *
 * The block is inserted through a replacer function, so `$`-sequences in the
 * OCR text (`$&`, `$1`, `$$`, ...) are inserted literally instead of being
 * interpreted as replacement patterns. The placeholder is matched by its exact
 * original text, so unusual spacing after the colon still gets its block.
 *
 * @param text prompt text possibly containing photo placeholders
 * @returns the augmented prompt, or the input unchanged when nothing was injected
 */
export async function augmentPromptWithVision(text: string): Promise<string> {
  if (!VISION_PROVIDER) return text;
  if (VISION_MAX_IMAGES <= 0) return text;

  const re = /\[Photo saved:\s*([^\]\n]+)\]/g;

  // Distinct image path -> exact placeholder text of its first occurrence.
  const placeholders = new Map<string, string>();
  for (const match of text.matchAll(re)) {
    const candidate = match[1]?.trim();
    if (!candidate || placeholders.has(candidate)) continue;
    placeholders.set(candidate, match[0]);
    if (placeholders.size >= VISION_MAX_IMAGES) break;
  }
  if (placeholders.size === 0) return text;

  let totalAdded = 0;
  let out = text;
  let injectedAny = false;

  for (const [imagePath, placeholder] of placeholders) {
    if (!isSafeTmpPath(imagePath)) {
      logger.warn({ imagePath }, 'Vision skipped (path outside TMP_DIR)');
      continue;
    }
    if (!fs.existsSync(imagePath)) {
      logger.warn({ imagePath }, 'Vision skipped (file missing)');
      continue;
    }

    try {
      logger.info({ imagePath, model: VISION_MODEL }, 'Vision OCR started');
      const ocr = await describeImage(imagePath);
      logger.info(
        { imagePath, snippet: clampText(ocr, 240) },
        'Vision OCR raw output (snippet)',
      );

      const clipped = clampText(ocr, VISION_MAX_CHARS_PER_IMAGE);
      const block = `\n\n[Vision OCR]\n${clipped}\n[/Vision OCR]\n`;
      if (totalAdded + block.length > VISION_MAX_TOTAL_CHARS) {
        logger.warn(
          { imagePath, totalAdded, maxTotal: VISION_MAX_TOTAL_CHARS },
          'Vision OCR skipped (budget exceeded)',
        );
        continue;
      }

      totalAdded += block.length;
      // Replacer function: the OCR text must not be parsed for "$" patterns.
      out = out.replace(placeholder, () => `${placeholder}${block}`);
      injectedAny = true;
      logger.info({ imagePath }, 'Vision OCR injected into prompt');
    } catch (err) {
      logger.warn({ imagePath, err }, 'Vision OCR failed');
    }
  }

  if (!injectedAny) return out;

  return (
    '[System note]\n' +
    'One or more images were OCR’d by a vision helper model. Use the content inside\n' +
    '`[Vision OCR]...[/Vision OCR]` as authoritative input. Do not claim you "cannot see"\n' +
    'the image; answer using the OCR/summary provided.\n' +
    '[/System note]\n\n' +
    out
  );
}
|
|
||||||
Loading…
Add table
Reference in a new issue