86 lines
3.4 KiB
JavaScript
86 lines
3.4 KiB
JavaScript
// Run vision-grounding predictions against any OpenAI-compatible vision API.
// Use for GPT-4o (api.openai.com), GLM-4V via z.ai, UI-TARS via vLLM, etc.
//
// Env:
//   VISION_BASE_URL   e.g. https://api.openai.com/v1
//   VISION_API_KEY    bearer token
//   VISION_MODEL      e.g. gpt-4o, glm-4v, ui-tars-7b
//   VISION_LABEL      short name used in the output filename, e.g. gpt-4o
//                     (optional; falls back to VISION_MODEL)
//
// Usage:
//   VISION_BASE_URL=... VISION_API_KEY=... VISION_MODEL=gpt-4o VISION_LABEL=gpt-4o \
//     node predict-openai-compat.mjs
|
|
|
|
import { readFileSync, writeFileSync, readdirSync } from 'fs';
|
|
import path from 'path';
|
|
|
|
// Directory containing this script; the results/ and screenshots/ folders
// used further down are resolved relative to it.
const ROOT = path.resolve(import.meta.dirname);

// Connection settings come straight from the environment.
const {
  VISION_BASE_URL: BASE,
  VISION_API_KEY: KEY,
  VISION_MODEL: MODEL,
} = process.env;

// Label recorded in the output and used in its filename. An unset or empty
// VISION_LABEL falls back to the model name, then to 'unknown'.
const LABEL = process.env.VISION_LABEL || MODEL || 'unknown';

// All three connection settings are mandatory; bail out before doing any work.
if ([BASE, KEY, MODEL].some((value) => !value)) {
  console.error('Missing VISION_BASE_URL / VISION_API_KEY / VISION_MODEL');
  process.exit(1);
}
|
|
|
|
/**
 * Build the instruction text sent alongside the screenshot.
 *
 * @param {string} label - Description of the element the model should locate.
 * @returns {string} Single-line prompt asking for a strict `{"x", "y"}` JSON reply.
 */
const PROMPT = (label) => {
  const sentences = [
    'Look at this 1024x768 screenshot.',
    'Identify the visual element described and respond with strict JSON only: {"x": <integer>, "y": <integer>} where x and y are the pixel coordinates of the element\'s CENTER in the image.',
    'No prose, no markdown fences, just the JSON object.',
    `Element to locate: ${label}`,
  ];
  return sentences.join(' ');
};
|
|
|
|
// Collect the ground-truth files in results/: names made of two digits, a
// dash, anything, then ".json" (this excludes the predictions-*.json outputs).
const truthFiles = [];
for (const entry of readdirSync(path.join(ROOT, 'results'))) {
  if (/^\d{2}-.+\.json$/.test(entry)) {
    truthFiles.push(entry);
  }
}

// Accumulator for everything written to disk at the end of the run.
// `fixtures` maps a fixture name to its list of per-target results.
const out = {
  model: LABEL,
  prompt_template: PROMPT('<label>'),
  captured_at: new Date().toISOString(),
  fixtures: {},
};
|
|
|
|
// Chat-completions endpoint. Trailing slashes on the base URL are dropped so
// that "https://host/v1/" does not become "https://host/v1//chat/completions".
const endpoint = `${BASE.replace(/\/+$/, '')}/chat/completions`;

/**
 * Pull integer pixel coordinates out of a model reply.
 *
 * The x-before-y pattern is tried first; if it does not match, the same
 * pattern with the keys swapped is tried so that a reply such as
 * {"y": 10, "x": 20} is still accepted.
 *
 * @param {string} raw - Text content returned by the model.
 * @returns {{x: number, y: number} | null} Parsed point, or null if none found.
 */
function extractPoint(raw) {
  const xFirst = raw.match(/\{[^}]*"x"\s*:\s*(-?\d+)[^}]*"y"\s*:\s*(-?\d+)[^}]*\}/);
  if (xFirst) {
    return { x: Number.parseInt(xFirst[1], 10), y: Number.parseInt(xFirst[2], 10) };
  }
  const yFirst = raw.match(/\{[^}]*"y"\s*:\s*(-?\d+)[^}]*"x"\s*:\s*(-?\d+)[^}]*\}/);
  if (yFirst) {
    return { x: Number.parseInt(yFirst[2], 10), y: Number.parseInt(yFirst[1], 10) };
  }
  return null;
}

/**
 * Turn a thrown value into a short message for the results file and the log.
 * Node's fetch rejects with a generic "fetch failed" error whose `cause`
 * carries the underlying reason, so the cause is appended when present.
 *
 * @param {unknown} err - Value caught from a failed request.
 * @returns {string} Human-readable failure reason.
 */
function describeFailure(err) {
  if (!(err instanceof Error)) return String(err);
  const cause = err.cause instanceof Error ? err.cause.message : undefined;
  return cause ? `${err.message} (${cause})` : err.message;
}

// One request per (fixture, target). Requests run sequentially, and every
// outcome - prediction or error - is appended to out.fixtures[fixture].
for (const tf of truthFiles) {
  const t = JSON.parse(readFileSync(path.join(ROOT, 'results', tf), 'utf8'));
  const fixture = t.fixture;
  // The screenshot is encoded once per fixture and reused for all its targets.
  const pngB64 = readFileSync(path.join(ROOT, 'screenshots', `${fixture}.png`)).toString('base64');
  out.fixtures[fixture] = [];

  for (const target of t.targets) {
    if (target.missing) continue;

    const body = {
      model: MODEL,
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: PROMPT(target.label) },
            { type: 'image_url', image_url: { url: `data:image/png;base64,${pngB64}` } },
          ],
        },
      ],
      max_tokens: 64,
      temperature: 0,
    };

    let raw;
    try {
      const r = await fetch(endpoint, {
        method: 'POST',
        headers: { 'content-type': 'application/json', authorization: `Bearer ${KEY}` },
        body: JSON.stringify(body),
      });
      if (!r.ok) {
        out.fixtures[fixture].push({ id: target.id, error: `HTTP ${r.status}: ${await r.text()}` });
        console.error(`[${fixture}] ${target.id} HTTP ${r.status}`);
        continue;
      }
      const j = await r.json();
      const content = j.choices?.[0]?.message?.content ?? '';
      // Non-string content is serialized so the parse step below records a
      // normal failure instead of throwing on `.match`.
      raw = typeof content === 'string' ? content : JSON.stringify(content);
    } catch (err) {
      // A network failure or malformed response body must not abort the run:
      // record it against this target and keep going so that the results
      // gathered so far still get written.
      const reason = describeFailure(err);
      out.fixtures[fixture].push({ id: target.id, error: `request failed: ${reason}` });
      console.error(`[${fixture}] ${target.id} request failed: ${reason}`);
      continue;
    }

    const pred = extractPoint(raw);
    if (!pred) {
      out.fixtures[fixture].push({ id: target.id, raw, error: 'no JSON {x,y} found' });
      console.error(`[${fixture}] ${target.id} parse fail: ${raw.slice(0, 120)}`);
      continue;
    }
    out.fixtures[fixture].push({ id: target.id, pred, raw });
    console.log(`[${fixture}] ${target.id} -> (${pred.x},${pred.y})`);
  }
}
|
|
|
|
// Persist all collected predictions and tell the user how to score them.
// LABEL falls back to the model id, and ids such as "org/model" contain path
// separators; left as-is they would point into a non-existent subdirectory and
// the write would fail after every request had already been made. Separators
// are replaced in the filename only - the label stored in the JSON is untouched.
const fileLabel = LABEL.replace(/[\\/]+/g, '_');
const outPath = path.join(ROOT, 'results', `predictions-${fileLabel}.json`);
writeFileSync(outPath, JSON.stringify(out, null, 2));
console.log(`\nWrote ${outPath}`);
console.log(`Score with: node score.mjs ${outPath}`);
|