// clawdie-ai/scripts/browser-jail-validation/score.mjs

// Score a predictions file against ground-truth DOM bboxes.
// Usage:  node score.mjs results/predictions-<model>.json
//
// Outputs a per-target table + an aggregate report:
//   - pass rate at 30 px tolerance (center distance)
//   - mean / median / max pixel distance
//   - in-bbox rate (predicted point falls inside the ground-truth bbox)
//
// A target is reported as an error row, and left out of the aggregate
// statistics, if either its ground truth or its prediction is missing.

import { readFileSync, readdirSync, writeFileSync } from 'fs';
import path from 'path';

// Largest rounded center distance (in px) that still counts as a pass.
const TOLERANCE_PX = 30;
// Directory holding this script; ground-truth files are read from ROOT/results.
const ROOT = path.resolve(import.meta.dirname);

// The predictions file is the one required command-line argument.
const [, , predPath] = process.argv;
if (!predPath) {
  console.error('usage: node score.mjs <predictions.json>');
  process.exit(1);
}

const preds = JSON.parse(readFileSync(predPath, 'utf8'));

// Ground-truth files are the entries of results/ named "<two digits>-<anything>.json".
const isTruthFile = (name) => /^\d{2}-.+\.json$/.test(name);
const truthFiles = readdirSync(path.join(ROOT, 'results')).filter(isTruthFile);

// Lookup table: truth[fixture][targetId] -> ground-truth target record.
const truth = {};
truthFiles.forEach((file) => {
  const doc = JSON.parse(readFileSync(path.join(ROOT, 'results', file), 'utf8'));
  const fixtureName = doc.fixture;
  const targetsById = Object.fromEntries(
    doc.targets.map((target) => [target.id, target]),
  );
  truth[fixtureName] = targetsById;
});

/**
 * Scores a single predicted target against the ground truth of its fixture.
 *
 * @param {string} fixture - Fixture key taken from the predictions file.
 * @param {object} p - Prediction entry; its `id` selects the ground-truth target
 *   and its `pred` is expected to carry numeric `x` / `y`.
 * @returns {object} An error row (`error: 'no ground truth'` or
 *   `error: 'no prediction'`), or a scored row with the rounded center
 *   distance, the in-bbox flag and the pass flag.
 */
function scoreTarget(fixture, p) {
  const t = truth[fixture]?.[p.id];
  if (!t || t.missing) {
    return { fixture, id: p.id, error: 'no ground truth' };
  }

  const point = p.pred;
  const hasPoint =
    Boolean(point) && typeof point.x === 'number' && typeof point.y === 'number';
  if (!hasPoint) {
    return { fixture, id: p.id, label: t.label, error: 'no prediction' };
  }

  const center = t.center;
  const box = t.bbox;

  // Euclidean distance from the predicted point to the bbox center, in whole px.
  const dx = point.x - center.x;
  const dy = point.y - center.y;
  const dist = Math.round(Math.sqrt(dx * dx + dy * dy));

  // Edges of the bounding box count as inside.
  const withinX = point.x >= box.x && point.x <= box.x + box.w;
  const withinY = point.y >= box.y && point.y <= box.y + box.h;

  return {
    fixture,
    id: p.id,
    label: t.label,
    pred: point,
    truth_center: center,
    bbox: box,
    dist,
    inside: withinX && withinY,
    pass: dist <= TOLERANCE_PX,
  };
}

// One row per predicted target, in the order the predictions file lists them.
const rows = [];
for (const fixture of Object.keys(preds.fixtures)) {
  for (const p of preds.fixtures[fixture]) {
    rows.push(scoreTarget(fixture, p));
  }
}

// Aggregate statistics are computed only over rows that could be scored;
// error rows (missing ground truth or prediction) are counted separately in
// `error_count` so their exclusion is visible in the report.
const scored = rows.filter((r) => !r.error);
const passes = scored.filter((r) => r.pass).length;
const insides = scored.filter((r) => r.inside).length;
const dists = scored.map((r) => r.dist).sort((a, b) => a - b);

/**
 * Ratio rounded to three decimals.
 *
 * @param {number} count - Numerator.
 * @param {number} total - Denominator.
 * @returns {number|null} The rounded ratio, or null when `total` is 0 (a bare
 *   division would give NaN, which JSON.stringify turns into null anyway).
 */
function rate(count, total) {
  if (total === 0) {
    return null;
  }
  return Number((count / total).toFixed(3));
}

/**
 * Median of an array that is already sorted ascending.
 *
 * @param {number[]} sorted - Ascending-sorted values.
 * @returns {number|null} The middle value for an odd count, the mean of the
 *   two middle values for an even count, or null for an empty array.
 */
function medianOfSorted(sorted) {
  if (sorted.length === 0) {
    return null;
  }
  const mid = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 1) {
    return sorted[mid];
  }
  return (sorted[mid - 1] + sorted[mid]) / 2;
}

const hasScores = dists.length > 0;
const mean = hasScores ? dists.reduce((a, b) => a + b, 0) / dists.length : null;
const median = medianOfSorted(dists);
const max = hasScores ? dists[dists.length - 1] : null;

// Distance and rate fields are explicitly null when nothing could be scored,
// so every key is always present in the serialized summary.
const summary = {
  model: preds.model,
  tolerance_px: TOLERANCE_PX,
  total: scored.length,
  error_count: rows.length - scored.length,
  pass_count: passes,
  pass_rate: rate(passes, scored.length),
  inside_bbox_count: insides,
  inside_bbox_rate: rate(insides, scored.length),
  mean_dist_px: mean === null ? null : Math.round(mean),
  median_dist_px: median,
  max_dist_px: max,
};

// Print one line per row: error rows show their reason, scored rows show the
// PASS/FAIL verdict, distance, in-bbox marker, predicted and true centers.
console.log('Per-target results:');
console.log('─'.repeat(96));
for (const r of rows) {
  if (r.error) {
    console.log(`  [${r.fixture}] ${r.id}  ✗ ${r.error}`);
    continue;
  }
  const tag = r.pass ? 'PASS' : 'FAIL';
  const insideTag = r.inside ? ' in-bbox' : '';
  // Ids come straight from JSON and may be numbers; coerce before padding so
  // a non-string id cannot throw here and abort the run before the score
  // file is written.
  const idCol = String(r.id).padEnd(18);
  console.log(
    `  [${r.fixture}] ${idCol} ${tag}  dist=${String(r.dist).padStart(4)}px${insideTag}  pred=(${r.pred.x},${r.pred.y}) truth=(${r.truth_center.x},${r.truth_center.y})  "${r.label}"`,
  );
}
console.log('─'.repeat(96));
console.log('Summary:');
console.log(JSON.stringify(summary, null, 2));

// Persist the summary and the per-target rows next to the ground-truth files,
// named after the predictions file: results/score-<predictions basename>.json.
const out = path.join(ROOT, 'results', `score-${path.basename(predPath, '.json')}.json`);
writeFileSync(out, JSON.stringify({ summary, rows }, null, 2));
console.log(`\nWrote ${out}`);