// clawdie-ai/scripts/browser-jail-validation/predict-openai-compat.mjs

// Run vision-grounding predictions against any OpenAI-compatible vision API.
// Use for GPT-4o (api.openai.com), GLM-4V via z.ai, UI-TARS via vLLM, etc.
//
// Env:
//   VISION_BASE_URL   e.g. https://api.openai.com/v1
//   VISION_API_KEY    bearer token
//   VISION_MODEL      e.g. gpt-4o, glm-4v, ui-tars-7b
//   VISION_LABEL      optional short name used in output filename, e.g. gpt-4o (defaults to VISION_MODEL)
//
// Usage:
//   VISION_BASE_URL=... VISION_API_KEY=... VISION_MODEL=gpt-4o VISION_LABEL=gpt-4o \
//     node predict-openai-compat.mjs

import { readFileSync, writeFileSync, readdirSync } from 'fs';
import path from 'path';

// Directory holding this script; the results/ and screenshots/ folders are
// resolved relative to it.
const ROOT = path.resolve(import.meta.dirname);

// Endpoint settings are taken from the environment (see the header comment).
const {
  VISION_BASE_URL: BASE,
  VISION_API_KEY: KEY,
  VISION_MODEL: MODEL,
} = process.env;

// Name recorded in the output and used in its filename; falls back to the
// model id when no explicit label is given.
const LABEL = process.env.VISION_LABEL || MODEL || 'unknown';

// Base URL, API key and model are all mandatory: bail out before doing any work.
const hasRequiredConfig = Boolean(BASE && KEY && MODEL);
if (!hasRequiredConfig) {
  console.error('Missing VISION_BASE_URL / VISION_API_KEY / VISION_MODEL');
  process.exit(1);
}

/**
 * Builds the instruction text sent to the model together with a screenshot.
 *
 * @param {string} label - Description of the element the model should locate.
 * @returns {string} Prompt asking for a strict-JSON `{"x", "y"}` reply with
 *   the pixel coordinates of the element's center.
 */
const PROMPT = (label) => {
  const instruction = `Look at this 1024x768 screenshot. Identify the visual element described and respond with strict JSON only: {"x": <integer>, "y": <integer>} where x and y are the pixel coordinates of the element's CENTER in the image. No prose, no markdown fences, just the JSON object. Element to locate: ${label}`;
  return instruction;
};

// Input files in results/ are named "NN-<name>.json". The two-digit prefix
// also keeps the predictions-*.json files written to the same directory out
// of the input set.
const TRUTH_FILE_PATTERN = /^\d{2}-.+\.json$/;
const truthFiles = [];
for (const entry of readdirSync(path.join(ROOT, 'results'))) {
  if (TRUTH_FILE_PATTERN.test(entry)) {
    truthFiles.push(entry);
  }
}

// Accumulator for the whole run; per-fixture result arrays are added under
// `fixtures` as each fixture is processed.
const out = {
  model: LABEL,
  prompt_template: PROMPT('<label>'),
  captured_at: new Date().toISOString(),
  fixtures: {},
};

// The model is asked for {"x": ..., "y": ...}, but the key order in the reply
// is not guaranteed, so both orders are accepted.
const POINT_XY = /\{[^}]*"x"\s*:\s*(-?\d+)[^}]*"y"\s*:\s*(-?\d+)[^}]*\}/;
const POINT_YX = /\{[^}]*"y"\s*:\s*(-?\d+)[^}]*"x"\s*:\s*(-?\d+)[^}]*\}/;

/**
 * Extracts the first `{x, y}` integer point from a model reply.
 *
 * @param {string} raw - Text of the model's reply.
 * @returns {{x: number, y: number} | null} Parsed point, or null when the
 *   reply contains no object with both an "x" and a "y" integer.
 */
function parsePoint(raw) {
  const xy = raw.match(POINT_XY);
  if (xy) {
    return { x: Number.parseInt(xy[1], 10), y: Number.parseInt(xy[2], 10) };
  }
  const yx = raw.match(POINT_YX);
  if (yx) {
    return { x: Number.parseInt(yx[2], 10), y: Number.parseInt(yx[1], 10) };
  }
  return null;
}

/**
 * Normalizes `message.content` to a plain string so it can be pattern-matched.
 *
 * @param {unknown} content - The `content` field of the first choice's message.
 * @returns {string} The content itself when it is a string; the concatenated
 *   `text` fields when it is an array of parts (NOTE(review): some compatible
 *   servers appear to reply this way — confirm); otherwise an empty string.
 */
function messageText(content) {
  if (typeof content === 'string') return content;
  if (Array.isArray(content)) {
    return content
      .map((part) => (typeof part?.text === 'string' ? part.text : ''))
      .join('');
  }
  return '';
}

/**
 * Renders a thrown value as a short message, including the underlying cause
 * when one is attached to the error.
 *
 * @param {unknown} err - Value caught from a failed request.
 * @returns {string} Human-readable description.
 */
function describeError(err) {
  const message = err instanceof Error ? err.message : String(err);
  const cause = err?.cause;
  if (cause == null) return message;
  return `${message} (${cause instanceof Error ? cause.message : String(cause)})`;
}

// Tolerate a trailing slash on VISION_BASE_URL so the path never becomes
// "//chat/completions".
const ENDPOINT = `${BASE.replace(/\/+$/, '')}/chat/completions`;

// For every truth file: load its screenshot once, then query the model once
// per target. Every outcome (prediction, HTTP error, transport error, parse
// failure) is recorded under out.fixtures[fixture] so one bad request never
// discards the rest of the run.
for (const tf of truthFiles) {
  const t = JSON.parse(readFileSync(path.join(ROOT, 'results', tf), 'utf8'));
  const fixture = t.fixture;
  const pngB64 = readFileSync(path.join(ROOT, 'screenshots', `${fixture}.png`)).toString('base64');
  // The data URL is the same for every target of this fixture; build it once.
  const imageUrl = `data:image/png;base64,${pngB64}`;
  const results = [];
  out.fixtures[fixture] = results;

  for (const target of t.targets) {
    // Targets flagged as missing in the truth file are not queried.
    if (target.missing) continue;
    const body = {
      model: MODEL,
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: PROMPT(target.label) },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
      max_tokens: 64,
      temperature: 0,
    };

    let raw;
    try {
      const r = await fetch(ENDPOINT, {
        method: 'POST',
        headers: { 'content-type': 'application/json', authorization: `Bearer ${KEY}` },
        body: JSON.stringify(body),
      });
      if (!r.ok) {
        results.push({ id: target.id, error: `HTTP ${r.status}: ${await r.text()}` });
        console.error(`[${fixture}] ${target.id}  HTTP ${r.status}`);
        continue;
      }
      const j = await r.json();
      raw = messageText(j?.choices?.[0]?.message?.content);
    } catch (err) {
      // fetch rejects on transport failures and r.json()/r.text() reject on an
      // unreadable body; record the failure and keep going with the next target.
      const reason = describeError(err);
      results.push({ id: target.id, error: `request failed: ${reason}` });
      console.error(`[${fixture}] ${target.id}  request failed: ${reason}`);
      continue;
    }

    const pred = parsePoint(raw);
    if (!pred) {
      results.push({ id: target.id, raw, error: 'no JSON {x,y} found' });
      console.error(`[${fixture}] ${target.id}  parse fail: ${raw.slice(0, 120)}`);
      continue;
    }
    results.push({ id: target.id, pred, raw });
    console.log(`[${fixture}] ${target.id}  -> (${pred.x},${pred.y})`);
  }
}

// LABEL can fall back to the model id, which may contain path separators
// (e.g. "org/model"). Left as-is, the filename would point into a
// non-existent subdirectory and the write would fail only after every request
// had already been made. Only the filename is sanitized; out.model keeps the
// original label.
const fileLabel = LABEL.replace(/[\\/]+/g, '_');
const outPath = path.join(ROOT, 'results', `predictions-${fileLabel}.json`);
writeFileSync(outPath, JSON.stringify(out, null, 2));
console.log(`\nWrote ${outPath}`);
console.log(`Score with:  node score.mjs ${outPath}`);