chore: remove Tier-A unused SaaS modules (browser-operator, tts, vision) (Sam & Claude) #7

Merged
clawdie merged 1 commit from chore/prune-tier-a-unused into main 2026-06-05 13:22:07 +02:00
6 changed files with 0 additions and 805 deletions

View file

@ -1,115 +0,0 @@
import { describe, expect, it, vi } from 'vitest';
import { createClawdieBrowserOperator, type ClawdieBrowserOperator } from './browser-operator.js';
import type { BackendCaller } from './browser-orchestrator.js';
import type { BrowserCloneRow } from './browser-session-registry.js';
// Fixed reference instant: seeds the timestamp columns of the fixture row and is
// returned by the injected `now` clock in the tests below.
const baseTime = new Date('2026-05-11T12:00:00.000Z');
/**
 * Minimal in-memory stand-in for a `pg.Pool` that serves exactly one
 * `browser_clones` row. It recognises two statement shapes by their prefix
 * (after whitespace normalisation) and answers everything else with no rows.
 */
class OperatorPool {
  /** The single row this fake serves on SELECT and mutates on UPDATE. */
  row: BrowserCloneRow = {
    session_id: '00000000-0000-4000-8000-000000000150',
    tenant_id: 'tenant-a',
    clone_name: 'browsertask150',
    ip: '192.168.72.150',
    status: 'open',
    credential_mode: 'clean',
    operator_grant_token_jti: null,
    created_at: baseTime,
    updated_at: baseTime,
    opened_at: baseTime,
    closed_at: null,
    last_action_at: null,
    expires_at: null,
    error_code: null,
    error_message: null,
  };

  /**
   * Emulate `pool.query`.
   *
   * - `SELECT session_id::text ...` returns the row when `params[0]` equals its session id.
   * - `UPDATE browser_clones ...` copies status / updated_at (and, when truthy,
   *   closed_at / last_action_at) from the positional params onto the row.
   */
  async query(sql: string, params: unknown[] = []): Promise<{ rows: any[] }> {
    const statement = sql.replace(/\s+/g, ' ').trim();

    if (statement.startsWith('SELECT session_id::text')) {
      const isSeededSession = this.row.session_id === params[0];
      return { rows: isSeededSession ? [this.row] : [] };
    }

    if (!statement.startsWith('UPDATE browser_clones')) {
      return { rows: [] };
    }

    // Positional layout used by the UPDATE: index 0 and 3 are not needed here.
    const status = params[1];
    const updatedAt = params[2];
    const closedAt = params[4];
    const lastActionAt = params[5];

    this.row.status = status as any;
    this.row.updated_at = updatedAt as Date;
    if (closedAt) {
      this.row.closed_at = closedAt as Date;
    }
    if (lastActionAt) {
      this.row.last_action_at = lastActionAt as Date;
    }
    return { rows: [] };
  }
}
describe('ClawdieBrowserOperator', () => {
  // Session id of the row pre-seeded in OperatorPool.
  const seededSessionId = '00000000-0000-4000-8000-000000000150';
  // Deterministic clock handed to the orchestrator deps.
  const frozenNow = () => baseTime;

  it('exports the UI-TARS-compatible screenshot/execute shape', () => {
    const operator: ClawdieBrowserOperator = createClawdieBrowserOperator(new OperatorPool() as any, 's1');
    for (const member of ['screenshot', 'execute', 'finished', 'close'] as const) {
      expect(typeof operator[member]).toBe('function');
    }
  });

  it('adapts screenshot responses to imageBase64', async () => {
    const backend: BackendCaller = vi.fn(async () => ({
      ok: true,
      status: 200,
      data: { ok: true, image_base64: 'iVBORw0=', width: 1024, height: 768 },
    }));
    const operator = createClawdieBrowserOperator(new OperatorPool() as any, seededSessionId, {
      backend,
      now: frozenNow,
    });

    const shot = await operator.screenshot();

    expect(shot).toEqual({ imageBase64: 'iVBORw0=', width: 1024, height: 768 });
    expect(backend).toHaveBeenCalledWith('192.168.72.150', 'POST', '/screenshot', {}, 30000);
  });

  it('translates basic predictions to browser action helpers', async () => {
    // The fake backend echoes the path and body it was called with.
    const backend: BackendCaller = vi.fn(async (_ip, _method, path, body) => ({
      ok: true,
      status: 200,
      data: { ok: true, path, body },
    }));
    const operator = createClawdieBrowserOperator(new OperatorPool() as any, seededSessionId, {
      backend,
      now: frozenNow,
    });

    const cases = [
      {
        prediction: { action: 'navigate', url: 'https://example.com/' },
        expectedPath: '/navigate',
        expectedBody: { url: 'https://example.com/' },
      },
      {
        prediction: { action: 'click', selector: '#go' },
        expectedPath: '/click',
        expectedBody: { selector: '#go' },
      },
      {
        prediction: { action: 'type', text: 'hello' },
        expectedPath: '/type',
        expectedBody: { text: 'hello' },
      },
    ];

    // Run strictly in order, one action at a time, as the original assertions did.
    for (const { prediction, expectedPath, expectedBody } of cases) {
      await expect(operator.execute(prediction)).resolves.toMatchObject({
        path: expectedPath,
        body: expectedBody,
      });
    }
  });

  it('tracks finish and closes the browser session', async () => {
    const pool = new OperatorPool();
    const hostd = vi.fn(async () => ({ id: 'x', ok: true, output: 'ok' }));
    const operator = createClawdieBrowserOperator(pool as any, seededSessionId, {
      hostd,
      now: frozenNow,
    });

    const finishResult = operator.execute({ action: 'finish' });
    await expect(finishResult).resolves.toEqual({ ok: true, finished: true });
    expect(await operator.finished?.()).toBe(true);

    await operator.close?.();

    const expectedReapArgs = {
      clone: 'browsertask150',
      ip: '192.168.72.150',
      suffix: 'bt150',
    };
    expect(hostd).toHaveBeenCalledWith('browser-clone-reap', expectedReapArgs);
    expect(pool.row.status).toBe('closed');
  });
});

View file

@ -1,161 +0,0 @@
import type pg from 'pg';
import {
clickBrowserSession,
closeBrowserSession,
navigateBrowserSession,
readDomBrowserSession,
screenshotBrowserSession,
scrollBrowserSession,
typeBrowserSession,
type BrowserOrchestratorDeps,
} from './browser-orchestrator.js';
/** Screenshot payload produced by `ClawdieBrowserOperator.screenshot()`. */
export interface ClawdieBrowserScreenshot {
/** Base64 image data; `BrowserSessionOperator` falls back to `''` when the backend omits it. */
imageBase64: string;
/** Image width as reported by the backend, if any (presumably pixels — confirm against the backend). */
width?: number;
/** Image height as reported by the backend, if any (presumably pixels — confirm against the backend). */
height?: number;
}
/**
 * Operator contract: capture a screenshot, execute a model prediction, and
 * optionally report completion and release the session.
 */
export interface ClawdieBrowserOperator {
/** Capture the current browser state as a base64 image. */
screenshot(): Promise<ClawdieBrowserScreenshot>;
/** Carry out one prediction; the input is untyped and validated by the implementation. */
execute(prediction: unknown): Promise<unknown>;
/** Optional: whether the operator considers the task finished (sync or async). */
finished?(): Promise<boolean> | boolean;
/** Optional: release the underlying session. */
close?(): Promise<void>;
}
/**
 * Loose shape of a prediction accepted by `BrowserSessionOperator.execute`.
 * Every field is optional; which ones are read depends on the action.
 */
type Prediction = {
/** Action name; takes precedence over `type` when both are present. */
action?: string;
/** Alternative action name, used only when `action` is null/undefined. */
type?: string;
/** Target URL; required by the `navigate` / `open` actions. */
url?: string;
/** Click coordinates, forwarded by `click` when no `selector` is given. */
x?: number;
y?: number;
/** Element selector, forwarded by `click`, `type`/`input` and `scroll`. */
selector?: string;
/** Text for `type` / `input`; defaults to the empty string when omitted. */
text?: string;
/** Scroll deltas forwarded by `scroll`. */
dx?: number;
dy?: number;
/** Forwarded by the `screenshot` action. */
full_page?: boolean;
/** Forwarded by the `read_dom` / `read` actions. */
max_bytes?: number;
};
/**
 * Derive the canonical action name of a prediction.
 *
 * `action` wins over `type`; if both are null/undefined the result is `''`.
 * The chosen value is stringified, trimmed and lower-cased.
 */
function predictionAction(prediction: Prediction): string {
  const { action, type } = prediction;
  const rawName = action ?? type ?? '';
  const asText = String(rawName);
  return asText.trim().toLowerCase();
}
/**
 * Assert that a raw prediction is a non-null object and view it as a `Prediction`.
 *
 * No field-level validation happens here; arrays pass because they are objects.
 *
 * @throws Error when the value is null, undefined or any primitive.
 */
function ensureObject(prediction: unknown): Prediction {
  const isObjectLike = typeof prediction === 'object' && prediction !== null;
  if (isObjectLike) {
    return prediction as Prediction;
  }
  throw new Error('browser operator prediction must be an object');
}
/**
 * Collapse an ok/error result into its value.
 *
 * @returns `result.value` for a successful result.
 * @throws Error carrying `result.error` as its message and `result.code` as a
 *   `code` property for a failed result.
 */
function unwrap<T>(result: { ok: true; value: T } | { ok: false; code: string; error: string }): T {
  if (result.ok === false) {
    const failure = new Error(result.error);
    throw Object.assign(failure, { code: result.code });
  }
  return result.value;
}
/**
 * `ClawdieBrowserOperator` bound to one browser session.
 *
 * Each prediction is routed to the matching browser-orchestrator helper, and a
 * failed helper result is surfaced as a thrown Error (see `unwrap`).
 */
export class BrowserSessionOperator implements ClawdieBrowserOperator {
  /** Becomes true after a `finish`/`done` prediction or after `close()`. */
  private done = false;

  /**
   * @param pool Database pool passed through to every orchestrator helper.
   * @param sessionId Identifier of the browser session this operator drives.
   * @param deps Optional orchestrator dependencies, passed through unchanged.
   */
  constructor(
    private readonly pool: pg.Pool,
    private readonly sessionId: string,
    private readonly deps: BrowserOrchestratorDeps = {},
  ) {}

  /** Take a screenshot and adapt the backend's snake_case payload to `ClawdieBrowserScreenshot`. */
  async screenshot(): Promise<ClawdieBrowserScreenshot> {
    const outcome = await screenshotBrowserSession(this.pool, this.sessionId, {}, this.deps);
    const payload = unwrap(outcome) as {
      image_base64?: string;
      width?: number;
      height?: number;
    };
    const { image_base64: encoded, width, height } = payload;
    // A missing image is reported as an empty string rather than undefined.
    return { imageBase64: encoded ?? '', width, height };
  }

  /**
   * Execute one prediction.
   *
   * Supported actions (case-insensitive): navigate/open, click, type/input,
   * scroll, read_dom/read, screenshot, finish/done.
   *
   * @throws Error when the prediction is not an object, when navigate lacks a
   *   url, when the action is unsupported, or when the helper reports failure.
   */
  async execute(rawPrediction: unknown): Promise<unknown> {
    const prediction = ensureObject(rawPrediction);
    const action = predictionAction(prediction);
    const { pool, sessionId, deps } = this;

    switch (action) {
      case 'navigate':
      case 'open': {
        if (!prediction.url) throw new Error('navigate action requires url');
        const outcome = await navigateBrowserSession(pool, sessionId, { url: prediction.url }, deps);
        return unwrap(outcome);
      }
      case 'click': {
        // A selector takes priority; coordinates are the fallback target.
        const outcome = await clickBrowserSession(
          pool,
          sessionId,
          prediction.selector ? { selector: prediction.selector } : { x: prediction.x, y: prediction.y },
          deps,
        );
        return unwrap(outcome);
      }
      case 'type':
      case 'input': {
        const outcome = await typeBrowserSession(
          pool,
          sessionId,
          { text: prediction.text ?? '', selector: prediction.selector },
          deps,
        );
        return unwrap(outcome);
      }
      case 'scroll': {
        const outcome = await scrollBrowserSession(
          pool,
          sessionId,
          { dx: prediction.dx, dy: prediction.dy, selector: prediction.selector },
          deps,
        );
        return unwrap(outcome);
      }
      case 'read_dom':
      case 'read': {
        const outcome = await readDomBrowserSession(pool, sessionId, { max_bytes: prediction.max_bytes }, deps);
        return unwrap(outcome);
      }
      case 'screenshot': {
        // Unlike screenshot(), this returns the backend payload unadapted.
        const outcome = await screenshotBrowserSession(pool, sessionId, { full_page: prediction.full_page }, deps);
        return unwrap(outcome);
      }
      case 'finish':
      case 'done': {
        // Purely local bookkeeping: nothing is sent to the session.
        this.done = true;
        return { ok: true, finished: true };
      }
      default:
        throw new Error(`unsupported browser operator action: ${action || 'unknown'}`);
    }
  }

  /** Whether a finish/done prediction was executed or the session was closed. */
  finished(): boolean {
    return this.done;
  }

  /** Close the session, then mark the operator finished. The close result is not inspected. */
  async close(): Promise<void> {
    const { pool, sessionId, deps } = this;
    await closeBrowserSession(pool, sessionId, deps);
    this.done = true;
  }
}
/**
 * Factory for a session-bound operator, typed as the `ClawdieBrowserOperator` interface.
 *
 * @param pool Database pool handed to the operator.
 * @param sessionId Browser session the operator will drive.
 * @param deps Optional orchestrator dependencies; defaults to an empty object.
 */
export function createClawdieBrowserOperator(
  pool: pg.Pool,
  sessionId: string,
  deps: BrowserOrchestratorDeps = {},
): ClawdieBrowserOperator {
  const operator: ClawdieBrowserOperator = new BrowserSessionOperator(pool, sessionId, deps);
  return operator;
}

View file

@ -1,115 +0,0 @@
import { describe, it, expect } from 'vitest';
import { shouldApplyTts, stripTtsMarker, stripMarkdown } from './tts.js';
describe('shouldApplyTts', () => {
  type Mode = Parameters<typeof shouldApplyTts>[0]['mode'];

  // Positional wrapper so each case reads as (mode, hadInboundAudio, text).
  const decide = (mode: Mode, hadInboundAudio: boolean, text: string): boolean =>
    shouldApplyTts({ mode, hadInboundAudio, text });

  it('returns false for off mode', () => {
    expect(decide('off', false, 'hello')).toBe(false);
  });

  it('returns true for always mode', () => {
    expect(decide('always', false, 'hello')).toBe(true);
  });

  it('returns true for inbound mode when inbound was audio', () => {
    expect(decide('inbound', true, 'hello')).toBe(true);
  });

  it('returns false for inbound mode when inbound was not audio', () => {
    expect(decide('inbound', false, 'hello')).toBe(false);
  });

  it('returns true for tagged mode when [[tts]] is present', () => {
    expect(decide('tagged', false, 'hello [[tts]] world')).toBe(true);
  });

  it('returns false for tagged mode without marker', () => {
    expect(decide('tagged', false, 'hello world')).toBe(false);
  });

  it('[[tts]] is case-insensitive', () => {
    expect(decide('tagged', false, '[[TTS]]')).toBe(true);
  });
});
describe('stripTtsMarker', () => {
  // [test title, input, expected output]
  const cases: Array<[string, string, string]> = [
    ['removes [[tts]] from text', 'hello [[tts]] world', 'hello world'],
    ['returns text unchanged when no marker', 'hello world', 'hello world'],
    ['removes multiple markers', '[[tts]] a [[tts]] b', 'a b'],
    ['is case-insensitive', '[[TTS]]hello', 'hello'],
  ];

  for (const [title, input, expected] of cases) {
    it(title, () => {
      expect(stripTtsMarker(input)).toBe(expected);
    });
  }
});
describe('stripMarkdown', () => {
  // [test title, markdown input, expected plain text]
  const cases: Array<[string, string, string]> = [
    ['removes bold markers', '**hello**', 'hello'],
    ['removes italic markers', '*hello*', 'hello'],
    ['removes code fences', '```js\nconsole.log("hi")\n```', ''],
    ['removes inline code', 'use `foo` here', 'use foo here'],
    ['removes links keeping text', '[click](https://example.com)', 'click'],
    ['removes heading markers', '## Title', 'Title'],
    ['removes list markers', '- item', 'item'],
    ['removes strikethrough', '~~deleted~~', 'deleted'],
  ];

  for (const [title, markdown, expected] of cases) {
    it(title, () => {
      expect(stripMarkdown(markdown)).toBe(expected);
    });
  }
});

View file

@ -1,171 +0,0 @@
import fs from 'fs';
import path from 'path';
import { spawn } from 'child_process';
import { fileURLToPath } from 'url';
import { logger } from './logger.js';
import {
TTS_PROVIDER,
TTS_MAX_TEXT_LENGTH,
TTS_OUTPUT_FORMAT,
TTS_VOICE,
TMP_DIR,
} from './config.js';
/**
 * When a reply should be spoken, as evaluated by `shouldApplyTts`:
 * `always` — every reply; `inbound` — only when the inbound message was audio;
 * `tagged` — only when the text contains a `[[tts]]` marker; `off` — never.
 */
export type TtsAutoMode = 'always' | 'inbound' | 'tagged' | 'off';
/** Result of `synthesize`. */
interface TtsResult {
/** Path of the generated audio file, located under `<TMP_DIR>/tts`. */
audioPath: string;
}
/**
 * Reduce Markdown-flavoured reply text to plain text suitable for speech.
 *
 * Removes, in order: helper-injected `[Vision OCR]` / `[System note]` blocks,
 * fenced code blocks (content included), line-level markers (headings, list
 * bullets, ordered-list numbers, block quotes), inline markup (code, bold,
 * italic, strikethrough, links — keeping the visible text), horizontal rules,
 * and finally collapses runs of 3+ newlines and trims the result.
 *
 * @param text Raw text, possibly containing Markdown.
 * @returns The plain-text rendering; may be the empty string.
 */
export function stripMarkdown(text: string): string {
  let out = text;

  // Whole blocks whose content must never be spoken.
  out = out.replace(/\[Vision OCR\][\s\S]*?\[\/Vision OCR\]/g, '');
  out = out.replace(/\[System note\][\s\S]*?\[\/System note\]/g, '');
  out = out.replace(/```[\s\S]*?```/g, '');

  // Line-level markers go first. If the italic rule ran before this, a list
  // written with `*` bullets ("* a\n* b") would have its two bullets consumed
  // as one emphasis pair, leaving a stray leading space on the second item.
  out = out.replace(/^#{1,6}\s+/gm, '');
  out = out.replace(/^[-*+]\s+/gm, '');
  out = out.replace(/^\d+\.\s+/gm, '');
  out = out.replace(/^>\s+/gm, '');

  // Inline markup: keep the inner text, drop the delimiters.
  out = out.replace(/`([^`]+)`/g, '$1');
  out = out.replace(/\*\*([^*]+)\*\*/g, '$1');
  out = out.replace(/\*([^*]+)\*/g, '$1');
  out = out.replace(/__([^_]+)__/g, '$1');
  // Single-underscore emphasis only counts at word boundaries, so identifiers
  // such as snake_case_names keep their underscores.
  out = out.replace(/(?<![A-Za-z0-9])_([^_]+)_(?![A-Za-z0-9])/g, '$1');
  out = out.replace(/~~([^~]+)~~/g, '$1');
  out = out.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');

  out = out.replace(/---+/g, '');
  out = out.replace(/\n{3,}/g, '\n\n');
  return out.trim();
}
/**
 * Decide whether a reply should be converted to speech.
 *
 * @param opts.mode Auto-TTS policy.
 * @param opts.hadInboundAudio Whether the message being answered was audio (used by `inbound`).
 * @param opts.text Reply text, scanned for a case-insensitive `[[tts]]` marker (used by `tagged`).
 * @returns `true` when speech should be produced; unknown modes yield `false`.
 */
export function shouldApplyTts(opts: {
  mode: TtsAutoMode;
  hadInboundAudio: boolean;
  text: string;
}): boolean {
  switch (opts.mode) {
    case 'always':
      return true;
    case 'inbound':
      return opts.hadInboundAudio;
    case 'tagged':
      return /\[\[tts\]\]/i.test(opts.text);
    case 'off':
    default:
      return false;
  }
}
/**
 * Remove every `[[tts]]` marker (case-insensitive) from the text and trim the result.
 *
 * When a marker sat between two stretches of horizontal whitespace, those are
 * merged into a single space, so `'hello [[tts]] world'` becomes `'hello world'`
 * rather than keeping the double space the removal would otherwise leave.
 * Whitespace on only one side of a marker is preserved as-is, and newlines are
 * never touched.
 *
 * @param text Text that may contain markers.
 * @returns The text without markers, trimmed at both ends.
 */
export function stripTtsMarker(text: string): string {
  return text
    .replace(/([ \t]*)\[\[tts\]\]([ \t]*)/gi, (_marker: string, before: string, after: string) =>
      before && after ? ' ' : before + after,
    )
    .trim();
}
/**
 * Synthesize speech for `text` into a new file under `<TMP_DIR>/tts`.
 *
 * The text is stripped of Markdown and cut to `TTS_MAX_TEXT_LENGTH` characters
 * before being handed to the edge-tts CLI. The produced file must exist, be a
 * regular file and be at least 1024 bytes.
 *
 * If synthesis or any of those validation steps fails, the (possibly partial)
 * output file is removed on a best-effort basis before the error propagates,
 * so failed attempts do not accumulate in the temp directory.
 *
 * NOTE(review): `outputFormat` only selects the file extension and is logged;
 * it is not forwarded to the CLI invocation — confirm this is intended.
 *
 * @param text Reply text, possibly containing Markdown.
 * @param opts.voice Voice name; falls back to `TTS_VOICE` when empty/omitted.
 * @param opts.outputFormat Format hint; falls back to `TTS_OUTPUT_FORMAT`.
 * @returns The path of the generated audio file.
 * @throws Error when nothing is left to speak, the provider is `azure`
 *   (not implemented), the CLI fails, or the output file fails validation.
 */
export async function synthesize(
  text: string,
  opts?: {
    voice?: string;
    outputFormat?: string;
  },
): Promise<TtsResult> {
  const voice = opts?.voice || TTS_VOICE;
  const outputFormat = opts?.outputFormat || TTS_OUTPUT_FORMAT;

  const cleanText = stripMarkdown(text);
  if (!cleanText) {
    throw new Error('TTS: no text to synthesize after stripping markdown');
  }
  const truncated =
    cleanText.length > TTS_MAX_TEXT_LENGTH
      ? cleanText.slice(0, TTS_MAX_TEXT_LENGTH)
      : cleanText;

  const ttsDir = path.join(TMP_DIR, 'tts');
  fs.mkdirSync(ttsDir, { recursive: true });
  const ext = inferExtension(outputFormat);
  // Timestamp plus a short random suffix keeps concurrent calls from colliding.
  const fileName = `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}${ext}`;
  const audioPath = path.join(ttsDir, fileName);

  let bytes = 0;
  try {
    if (TTS_PROVIDER === 'azure') {
      throw new Error('TTS: azure provider not implemented yet');
    }
    await runEdgeTtsCli({
      text: truncated,
      voice,
      audioPath,
    });

    // Validation lives inside the try so a rejected file is cleaned up too.
    if (!fs.existsSync(audioPath)) {
      throw new Error('TTS: edge-tts completed but no audio file produced');
    }
    const stat = fs.statSync(audioPath);
    if (!stat.isFile()) {
      throw new Error('TTS: synthesized audio path is not a file');
    }
    if (stat.size < 1024) {
      throw new Error(`TTS: synthesized audio file too small (${stat.size} bytes)`);
    }
    bytes = stat.size;
  } catch (err) {
    try {
      fs.unlinkSync(audioPath);
    } catch {
      // Best effort: the file may never have been created.
    }
    throw err;
  }

  logger.info(
    {
      voice,
      format: outputFormat,
      chars: truncated.length,
      bytes,
      path: audioPath,
    },
    'TTS synthesized',
  );
  return { audioPath };
}
/**
 * Run the bundled `edge-tts` executable to write speech for `opts.text` to `opts.audioPath`.
 *
 * The executable is looked up at `<parent of this module's directory>/bin/edge-tts`
 * and is started with stdio ignored.
 *
 * @param opts.text Plain text to speak.
 * @param opts.voice Voice name passed via `--voice`.
 * @param opts.audioPath Destination passed via `--write-media`.
 * @returns A promise that resolves when the process exits with code 0.
 * @throws (rejects) when the executable is missing, fails to start, exits
 *   non-zero, or is terminated by a signal.
 */
function runEdgeTtsCli(opts: {
  text: string;
  voice: string;
  audioPath: string;
}): Promise<void> {
  return new Promise((resolve, reject) => {
    const args = [
      '--text',
      opts.text,
      '--voice',
      opts.voice,
      '--write-media',
      opts.audioPath,
    ];

    // fileURLToPath (unlike URL#pathname) decodes percent-escapes such as %20
    // and yields a valid path on Windows, so the lookup also works when the
    // install directory contains spaces or non-ASCII characters.
    const moduleDir = path.dirname(fileURLToPath(import.meta.url));
    const binPath = path.join(path.dirname(moduleDir), 'bin', 'edge-tts');

    const proc = spawn(binPath, args, { stdio: 'ignore' });

    proc.on('error', (err: Error & { code?: string }) => {
      if (err?.code === 'ENOENT') {
        reject(
          new Error(
            'TTS: edge-tts not found. Install with: pip install edge-tts',
          ),
        );
        return;
      }
      reject(err);
    });

    proc.on('exit', (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      if (code === null) {
        // A null exit code means the child was killed by a signal.
        reject(new Error(`TTS: edge-tts terminated by signal ${signal}`));
        return;
      }
      reject(new Error(`TTS: edge-tts exited with code ${code}`));
    });
  });
}
/**
 * Pick a file extension from a free-form output-format string.
 *
 * Matching is case-insensitive and substring-based, checked in this order:
 * ogg/opus → `.ogg`, webm → `.webm`, wav/riff/pcm → `.wav`, anything else → `.mp3`.
 */
function inferExtension(outputFormat: string): string {
  const format = outputFormat.toLowerCase();
  const mentionsAny = (...needles: string[]): boolean =>
    needles.some((needle) => format.includes(needle));

  if (mentionsAny('ogg', 'opus')) return '.ogg';
  if (mentionsAny('webm')) return '.webm';
  if (mentionsAny('wav', 'riff', 'pcm')) return '.wav';
  return '.mp3';
}

View file

@ -1,60 +0,0 @@
import fs from 'fs';
import path from 'path';
import { afterEach, describe, expect, it, vi } from 'vitest';
import { TMP_TESTS_DIR } from './config.js';
/**
 * Write a 4-byte placeholder "JPEG" to `filePath`: the start-of-image marker
 * (FF D8) immediately followed by the end-of-image marker (FF D9). It is not a
 * decodable picture — just enough bytes to be read and base64-packaged.
 */
function writeTinyJpeg(filePath: string): void {
  const startOfImage = [0xff, 0xd8];
  const endOfImage = [0xff, 0xd9];
  const contents = Buffer.from([...startOfImage, ...endOfImage]);
  fs.writeFileSync(filePath, contents);
}
// Exercises augmentPromptWithVision end to end with a real temp image file and
// a stubbed global fetch, so no network request is made.
describe('augmentPromptWithVision', () => {
// Snapshot of the environment taken when the suite is defined.
const originalEnv = { ...process.env };
afterEach(() => {
// Undo env mutations and drop mocks plus the module cache between tests.
// NOTE(review): globalThis.fetch is assigned directly below and is not
// restored here — confirm no later test in the same worker relies on real fetch.
process.env = { ...originalEnv };
vi.restoreAllMocks();
vi.resetModules();
});
it('injects [Vision OCR] block for saved Telegram photos under TMP_DIR', async () => {
// The image lives under TMP_TESTS_DIR (presumably inside TMP_DIR, as the
// test title implies — otherwise the path would be rejected as unsafe).
const dir = fs.mkdtempSync(path.join(TMP_TESTS_DIR, 'vision-test-'));
const imagePath = path.join(dir, 'photo.jpg');
writeTinyJpeg(imagePath);
// Enable the OpenRouter vision path with a dummy key.
process.env.VISION_PROVIDER = 'openrouter';
process.env.VISION_MODEL = 'nvidia/nemotron-nano-12b-v2-vl:free';
process.env.OPENROUTER_API_KEY = 'test-key';
// Stub response shaped like an OpenRouter chat completion.
const fetchMock = vi.fn(async () => {
return {
ok: true,
json: async () => ({
choices: [
{
message: { content: 'OCR:\nHELLO\n\nSUMMARY:\n- hi' },
},
],
}),
} as any;
});
// @ts-expect-error vitest runtime override
globalThis.fetch = fetchMock;
// config.ts evaluates VISION_PROVIDER at module load; the static
// import of TMP_TESTS_DIR above already cached an empty value.
// Reset so the dynamic import below re-reads env we just set.
vi.resetModules();
const { augmentPromptWithVision } = await import('./vision.js');
// The placeholder format must match what augmentPromptWithVision scans for.
const input = `User: test\n[Photo saved: ${imagePath}]`;
const out = await augmentPromptWithVision(input);
expect(fetchMock).toHaveBeenCalled();
// Both the injected OCR block and the leading system note must be present.
expect(out).toContain('[Vision OCR]');
expect(out).toContain('OCR:\nHELLO');
expect(out).toContain('Do not claim you "cannot see"');
});
});

View file

@ -1,183 +0,0 @@
import fs from 'fs';
import path from 'path';
import { readEnvFile } from './env.js';
import {
RUNTIME_ID,
TMP_DIR,
VISION_MAX_CHARS_PER_IMAGE,
VISION_MAX_IMAGES,
VISION_MAX_TOTAL_CHARS,
VISION_MODEL,
VISION_PROVIDER,
} from './config.js';
import { logger } from './logger.js';
/**
 * The subset of the OpenRouter chat-completions response body this module
 * reads. Every level is optional because the body is cast, not validated.
 */
type OpenRouterResponse = {
choices?: Array<{
/** Only the first choice's message content is consumed. */
message?: { content?: string };
}>;
};
/**
 * Memoised API key: `undefined` = not looked up yet, `null` = looked up and
 * absent, string = the key.
 */
let cachedOpenRouterKey: string | null | undefined;

/**
 * Resolve the OpenRouter API key once per process.
 *
 * `process.env.OPENROUTER_API_KEY` takes precedence; otherwise the env file is
 * consulted via `readEnvFile`. The outcome — including "not configured" — is
 * cached, so later changes to either source are not picked up.
 *
 * @returns The key, or `null` when none is configured.
 */
function getOpenRouterKey(): string | null {
  if (cachedOpenRouterKey === undefined) {
    const envKey = process.env.OPENROUTER_API_KEY;
    cachedOpenRouterKey = envKey
      ? envKey
      : readEnvFile(['OPENROUTER_API_KEY']).OPENROUTER_API_KEY || null;
  }
  return cachedOpenRouterKey;
}
/**
 * Limit `text` to `maxChars` characters.
 *
 * Text within the limit is returned untouched; longer text is cut at
 * `maxChars` and a truncation notice is appended on a new line (so the result
 * is longer than `maxChars` by the length of that notice).
 */
function clampText(text: string, maxChars: number): string {
  const fits = text.length <= maxChars;
  return fits ? text : `${text.slice(0, maxChars)}\n…(truncated)…`;
}
/**
 * Whether `filePath`, once resolved to an absolute path, lies strictly inside
 * `TMP_DIR`. The trailing separator on the root prevents a sibling such as
 * `<TMP_DIR>-other` from passing, and also means `TMP_DIR` itself does not pass.
 *
 * The check is lexical (`path.resolve`); symbolic links are not followed.
 */
function isSafeTmpPath(filePath: string): boolean {
  const rootWithSeparator = `${path.resolve(TMP_DIR)}${path.sep}`;
  const candidate = path.resolve(filePath);
  return candidate.startsWith(rootWithSeparator);
}
/**
 * Ask the configured OpenRouter vision model to OCR and summarise one image.
 *
 * The image is read from disk without blocking the event loop, embedded as a
 * base64 data URL and sent in a single chat-completions request.
 *
 * NOTE(review): the request carries no timeout or abort signal, so a stalled
 * connection keeps the caller waiting — consider adding one.
 *
 * @param imagePath Path of the image file to describe.
 * @returns The model's trimmed text answer (never empty).
 * @throws Error when no API key is configured, the file cannot be read, the
 *   HTTP response is not ok, or the model returns empty content.
 */
async function describeImageOpenRouter(imagePath: string): Promise<string> {
  const key = getOpenRouterKey();
  if (!key) throw new Error('OPENROUTER_API_KEY not configured');

  const resolved = path.resolve(imagePath);
  // Async read: this function runs on the message-handling path, so a
  // synchronous read of a large photo would stall every other task.
  const bytes = await fs.promises.readFile(resolved);
  const base64 = bytes.toString('base64');

  // Derive the MIME subtype from the extension; "jpg" is spelled "jpeg" in MIME
  // types, and a file without an extension is assumed to be PNG.
  const ext = path.extname(resolved).toLowerCase().replace('.', '') || 'png';
  const mime = ext === 'jpg' ? 'jpeg' : ext;
  const dataUrl = `data:image/${mime};base64,${base64}`;

  const prompt =
    'You are an OCR + screenshot/meme helper.\n' +
    '\n' +
    'Task:\n' +
    '1) Extract ALL readable text verbatim (including meme overlay text).\n' +
    '2) If some text is unclear, write [unclear] for that span.\n' +
    '3) After OCR, add a short bulleted summary (max 5 bullets) of what the image shows.\n' +
    '\n' +
    'Output format:\n' +
    'OCR:\n' +
    '<verbatim text>\n' +
    '\n' +
    'SUMMARY:\n' +
    '- ...';

  const body = {
    model: VISION_MODEL,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          { type: 'image_url', image_url: { url: dataUrl } },
        ],
      },
    ],
    max_tokens: 1200,
    temperature: 0,
  };

  const resp = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${key}`,
      'Content-Type': 'application/json',
      'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI',
      'X-Title': `${RUNTIME_ID}-ai`,
    },
    body: JSON.stringify(body),
  });

  if (!resp.ok) {
    // The error body is informational only; failing to read it must not mask the status.
    const txt = await resp.text().catch(() => '');
    throw new Error(`OpenRouter vision failed: ${resp.status} ${txt}`);
  }

  const data = (await resp.json()) as OpenRouterResponse;
  const content = data.choices?.[0]?.message?.content?.trim() || '';
  if (!content) throw new Error('OpenRouter vision returned empty content');
  return content;
}
/**
 * Describe an image with the configured vision provider.
 *
 * Only the `openrouter` provider is supported.
 *
 * @throws Error (as a rejection) for any other `VISION_PROVIDER` value.
 */
async function describeImage(imagePath: string): Promise<string> {
  if (VISION_PROVIDER === 'openrouter') {
    return describeImageOpenRouter(imagePath);
  }
  throw new Error(`Unsupported vision provider: ${VISION_PROVIDER || '(empty)'}`);
}
/**
 * Append an OCR/description block after `[Photo saved: /path]` placeholders.
 * Only runs when `VISION_PROVIDER` is set and `VISION_MAX_IMAGES > 0`.
 *
 * For each distinct path (at most `VISION_MAX_IMAGES`, in order of first
 * appearance) that lies inside `TMP_DIR` and exists on disk, the image is sent
 * to the vision helper and the clipped result is inserted right after the
 * first occurrence of that placeholder. Blocks that would push the total
 * injected size past `VISION_MAX_TOTAL_CHARS` are skipped. Failures for one
 * image are logged and do not affect the others.
 *
 * The placeholder is located by the exact text that was matched, so variations
 * in whitespace after the colon or around the path are handled, and the OCR
 * text is inserted verbatim (characters such as `$` are never interpreted).
 *
 * @param text Prompt text, possibly containing photo placeholders.
 * @returns The original text when nothing was injected; otherwise the
 *   augmented text prefixed with a `[System note]` explaining the OCR blocks.
 */
export async function augmentPromptWithVision(text: string): Promise<string> {
  if (!VISION_PROVIDER) return text;
  if (VISION_MAX_IMAGES <= 0) return text;

  const re = /\[Photo saved:\s*([^\]\n]+)\]/g;

  // Trimmed image path → the exact placeholder text of its first occurrence.
  const placeholders = new Map<string, string>();
  for (const match of text.matchAll(re)) {
    const imagePath = match[1]?.trim();
    if (!imagePath || placeholders.has(imagePath)) continue;
    placeholders.set(imagePath, match[0]);
    if (placeholders.size >= VISION_MAX_IMAGES) break;
  }
  if (placeholders.size === 0) return text;

  let totalAdded = 0;
  let out = text;
  let injectedAny = false;

  for (const [imagePath, placeholder] of placeholders) {
    if (!isSafeTmpPath(imagePath)) {
      logger.warn({ imagePath }, 'Vision skipped (path outside TMP_DIR)');
      continue;
    }
    if (!fs.existsSync(imagePath)) {
      logger.warn({ imagePath }, 'Vision skipped (file missing)');
      continue;
    }
    try {
      logger.info({ imagePath, model: VISION_MODEL }, 'Vision OCR started');
      const ocr = await describeImage(imagePath);
      logger.info(
        { imagePath, snippet: clampText(ocr, 240) },
        'Vision OCR raw output (snippet)',
      );
      const clipped = clampText(ocr, VISION_MAX_CHARS_PER_IMAGE);
      const block = `\n\n[Vision OCR]\n${clipped}\n[/Vision OCR]\n`;
      if (totalAdded + block.length > VISION_MAX_TOTAL_CHARS) {
        logger.warn(
          { imagePath, totalAdded, maxTotal: VISION_MAX_TOTAL_CHARS },
          'Vision OCR skipped (budget exceeded)',
        );
        continue;
      }

      // Splice by position instead of String#replace: a string replacement
      // would expand `$$`, `$&`, `` $` `` and `$'` found in the OCR text.
      // The placeholder is always present because `out` only ever grows by
      // insertions made directly after a closing `]`.
      const insertAt = out.indexOf(placeholder) + placeholder.length;
      out = out.slice(0, insertAt) + block + out.slice(insertAt);

      totalAdded += block.length;
      injectedAny = true;
      logger.info({ imagePath }, 'Vision OCR injected into prompt');
    } catch (err) {
      logger.warn({ imagePath, err }, 'Vision OCR failed');
    }
  }

  if (!injectedAny) return out;
  return (
    '[System note]\n' +
    'One or more images were OCRd by a vision helper model. Use the content inside\n' +
    '`[Vision OCR]...[/Vision OCR]` as authoritative input. Do not claim you "cannot see"\n' +
    'the image; answer using the OCR/summary provided.\n' +
    '[/System note]\n\n' +
    out
  );
}