clawdie-ai/scripts/browser-jail-validation/predict-openai-compat.mjs
Operator & Codex 6c549e7ad0 Rename browser validation assets
---
Build: pass | Tests: pass — 2383 passed (175 files)
2026-05-11 17:32:22 +02:00

86 lines
3.4 KiB
JavaScript

// Run vision-grounding predictions against any OpenAI-compatible vision API.
// Use for GPT-4o (api.openai.com), GLM-4V via z.ai, UI-TARS via vLLM, etc.
//
// Env:
// VISION_BASE_URL e.g. https://api.openai.com/v1
// VISION_API_KEY bearer token
// VISION_MODEL e.g. gpt-4o, glm-4v, ui-tars-7b
// VISION_LABEL optional short name used in the output filename (defaults to VISION_MODEL), e.g. gpt-4o
//
// Usage:
// VISION_BASE_URL=... VISION_API_KEY=... VISION_MODEL=gpt-4o VISION_LABEL=gpt-4o \
// node predict-openai-compat.mjs
import { readFileSync, writeFileSync, readdirSync } from 'fs';
import path from 'path';
// Directory containing this script; results/ and screenshots/ are resolved against it.
const ROOT = path.resolve(import.meta.dirname);

// Endpoint configuration, read from the environment.
// Trailing slashes are trimmed from the base URL so that appending
// "/chat/completions" to it never produces a double slash in the request path.
const BASE = process.env.VISION_BASE_URL?.replace(/\/+$/, '');
const KEY = process.env.VISION_API_KEY;
const MODEL = process.env.VISION_MODEL;
// Name recorded in the output and used in the output filename.
// `||` (not `??`) is deliberate: an empty VISION_LABEL also falls back to the model id.
const LABEL = process.env.VISION_LABEL || MODEL || 'unknown';

// The base URL, API key and model are all mandatory: stop before doing any work.
if (!BASE || !KEY || !MODEL) {
  console.error('Missing VISION_BASE_URL / VISION_API_KEY / VISION_MODEL');
  process.exit(1);
}
/**
 * Build the instruction text sent alongside a screenshot.
 *
 * @param {string} label - Description of the element the model should locate.
 * @returns {string} Prompt asking for a bare JSON object with the element's center coordinates.
 */
const PROMPT = (label) => {
  // Fixed instructions; only the element description varies between requests.
  const instructions = `Look at this 1024x768 screenshot. Identify the visual element described and respond with strict JSON only: {"x": <integer>, "y": <integer>} where x and y are the pixel coordinates of the element's CENTER in the image. No prose, no markdown fences, just the JSON object.`;
  return `${instructions} Element to locate: ${label}`;
};
// Ground-truth files sit in results/ and are named "<two digits>-<name>.json";
// anything else in that directory is ignored.
const truthFiles = readdirSync(path.join(ROOT, 'results')).filter((name) => {
  return /^\d{2}-.+\.json$/.test(name);
});

// Accumulator for the whole run: metadata plus one list of per-target
// entries per fixture, filled in as predictions come back.
const out = {
  model: LABEL,
  prompt_template: PROMPT('<label>'),
  captured_at: new Date().toISOString(),
  fixtures: {},
};
// Coordinate object as it may appear inside a model reply. Two patterns are
// kept so the keys are accepted in either order; both capture integer digits only.
const XY_PATTERN = /\{[^}]*"x"\s*:\s*(-?\d+)[^}]*"y"\s*:\s*(-?\d+)[^}]*\}/;
const YX_PATTERN = /\{[^}]*"y"\s*:\s*(-?\d+)[^}]*"x"\s*:\s*(-?\d+)[^}]*\}/;

/**
 * Extract integer pixel coordinates from a model reply.
 *
 * @param {string} raw - Reply text returned by the model.
 * @returns {{x: number, y: number} | null} The coordinates, or null when no
 *   object containing both an "x" and a "y" integer is found.
 */
function extractPoint(raw) {
  const xy = raw.match(XY_PATTERN);
  if (xy) {
    return { x: Number.parseInt(xy[1], 10), y: Number.parseInt(xy[2], 10) };
  }
  // Same object shape with the keys swapped: capture groups are (y, x).
  const yx = raw.match(YX_PATTERN);
  if (yx) {
    return { x: Number.parseInt(yx[2], 10), y: Number.parseInt(yx[1], 10) };
  }
  return null;
}

// For every ground-truth file, send one request per non-missing target and
// record either the predicted point or the reason no prediction was obtained.
// Requests are issued sequentially.
for (const tf of truthFiles) {
  const t = JSON.parse(readFileSync(path.join(ROOT, 'results', tf), 'utf8'));
  const fixture = t.fixture;
  const pngB64 = readFileSync(path.join(ROOT, 'screenshots', `${fixture}.png`)).toString('base64');
  // The same screenshot is sent for every target of the fixture, so build the data URL once.
  const imageUrl = `data:image/png;base64,${pngB64}`;
  out.fixtures[fixture] = [];

  for (const target of t.targets) {
    if (target.missing) continue;

    const body = {
      model: MODEL,
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: PROMPT(target.label) },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
      max_tokens: 64,
      temperature: 0,
    };

    let raw;
    try {
      const r = await fetch(`${BASE}/chat/completions`, {
        method: 'POST',
        headers: { 'content-type': 'application/json', authorization: `Bearer ${KEY}` },
        body: JSON.stringify(body),
      });
      if (!r.ok) {
        out.fixtures[fixture].push({ id: target.id, error: `HTTP ${r.status}: ${await r.text()}` });
        console.error(`[${fixture}] ${target.id} HTTP ${r.status}`);
        continue;
      }
      const j = await r.json();
      const content = j?.choices?.[0]?.message?.content ?? '';
      // Keep non-string content in serialized form so it can still be inspected in the output.
      raw = typeof content === 'string' ? content : JSON.stringify(content);
    } catch (err) {
      // A rejected fetch or an unreadable body is recorded like an HTTP error,
      // so one failed request does not discard the results gathered so far.
      const reason = err instanceof Error ? err.message : String(err);
      out.fixtures[fixture].push({ id: target.id, error: `request failed: ${reason}` });
      console.error(`[${fixture}] ${target.id} request failed: ${reason}`);
      continue;
    }

    const pred = extractPoint(raw);
    if (!pred) {
      out.fixtures[fixture].push({ id: target.id, raw, error: 'no JSON {x,y} found' });
      console.error(`[${fixture}] ${target.id} parse fail: ${raw.slice(0, 120)}`);
      continue;
    }
    out.fixtures[fixture].push({ id: target.id, pred, raw });
    console.log(`[${fixture}] ${target.id} -> (${pred.x},${pred.y})`);
  }
}
// Write all predictions and recorded errors as pretty-printed JSON under results/.
// LABEL may be the raw model id, which can contain path separators; left as-is
// they would make the path point into a subdirectory that does not exist and the
// write would fail after every request has already been made. Only the filename
// is sanitized; labels without separators produce the same filename as before.
const safeLabel = LABEL.replace(/[\\/]+/g, '_');
const outPath = path.join(ROOT, 'results', `predictions-${safeLabel}.json`);
writeFileSync(outPath, JSON.stringify(out, null, 2));
console.log(`\nWrote ${outPath}`);
console.log(`Score with: node score.mjs ${outPath}`);