clawdie-ai/scripts/browser-jail-validation/score.mjs
Operator & Codex 6c549e7ad0 Rename browser validation assets
---
Build: pass | Tests: pass — 2383 passed (175 files)
2026-05-11 17:32:22 +02:00

111 lines
3.5 KiB
JavaScript

// Score a predictions file against ground-truth DOM bboxes.
// Usage: node score.mjs results/predictions-<model>.json
//
// Outputs a per-target table + an aggregate report:
// - pass rate at 30 px tolerance (center distance)
// - mean / median / max pixel distance
// - in-bbox rate (predicted point falls inside the ground-truth bbox)
//
// A target whose ground truth or prediction is missing is listed as an error
// row and is excluded from the aggregate statistics.
import { readFileSync, readdirSync, writeFileSync } from 'fs';
import path from 'path';
// Largest (rounded) center-to-center distance, in pixels, that still passes.
const TOLERANCE_PX = 30;

// Absolute directory of this script; the results/ folder is resolved beneath it.
const ROOT = path.resolve(import.meta.dirname);

// The predictions file is taken from the first positional CLI argument.
const [, , predPath] = process.argv;
if (!predPath) {
  console.error('usage: node score.mjs <predictions.json>');
  process.exit(1);
}
// Predictions are parsed before any ground-truth file is touched.
const preds = JSON.parse(readFileSync(predPath, 'utf8'));

// Ground-truth files live in results/ and are named "<two digits>-<anything>.json".
const truthDir = path.join(ROOT, 'results');
const truthFiles = readdirSync(truthDir).filter((name) => /^\d{2}-.+\.json$/.test(name));

// truth[fixture][targetId] -> ground-truth target object for that id.
const truth = {};
truthFiles.forEach((name) => {
  const raw = readFileSync(path.join(truthDir, name), 'utf8');
  const groundTruth = JSON.parse(raw);
  const idTargetPairs = groundTruth.targets.map((target) => [target.id, target]);
  truth[groundTruth.fixture] = Object.fromEntries(idTargetPairs);
});
/**
 * Score one predicted target against its ground truth.
 *
 * @param {string} fixture - Fixture key the prediction is listed under.
 * @param {object} prediction - Entry from the predictions file; `id` and `pred` are read.
 * @returns {object} An error row (with `error` set) when the ground truth is
 *   absent/flagged missing or the predicted point lacks numeric x/y; otherwise a
 *   scored row carrying `dist`, `inside` and `pass`.
 */
function scoreTarget(fixture, prediction) {
  const { id, pred } = prediction;
  const target = truth[fixture]?.[id];

  if (!target || target.missing) {
    return { fixture, id, error: 'no ground truth' };
  }

  const hasPoint = Boolean(pred) && typeof pred.x === 'number' && typeof pred.y === 'number';
  if (!hasPoint) {
    return { fixture, id, label: target.label, error: 'no prediction' };
  }

  const { center, bbox } = target;

  // Euclidean distance from the predicted point to the bbox center, rounded to whole pixels.
  const offsetX = pred.x - center.x;
  const offsetY = pred.y - center.y;
  const dist = Math.round(Math.sqrt(offsetX * offsetX + offsetY * offsetY));

  // Bbox edges are inclusive on all four sides.
  const withinX = pred.x >= bbox.x && pred.x <= bbox.x + bbox.w;
  const withinY = pred.y >= bbox.y && pred.y <= bbox.y + bbox.h;

  return {
    fixture,
    id,
    label: target.label,
    pred,
    truth_center: center,
    bbox,
    dist,
    inside: withinX && withinY,
    // The pass check uses the rounded distance.
    pass: dist <= TOLERANCE_PX,
  };
}

// One row per entry in the predictions file, in file order.
const rows = [];
for (const [fixture, predTargets] of Object.entries(preds.fixtures)) {
  for (const prediction of predTargets) {
    rows.push(scoreTarget(fixture, prediction));
  }
}
/**
 * Median of an ascending-sorted numeric array.
 *
 * @param {number[]} sorted - Values already sorted in ascending order.
 * @returns {number|null} The middle value for odd lengths, the mean of the two
 *   middle values for even lengths, or null when the array is empty.
 */
function medianOfSorted(sorted) {
  if (sorted.length === 0) return null;
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
}

/**
 * Fraction `count / total` rounded to three decimals.
 *
 * @param {number} count - Numerator.
 * @param {number} total - Denominator.
 * @returns {number|null} The rounded rate, or null when `total` is 0 (a rate
 *   over nothing is undefined; null avoids emitting NaN).
 */
function toRate(count, total) {
  return total === 0 ? null : +(count / total).toFixed(3);
}

// Only rows without an `error` feed the rates and distance statistics; the
// number of errored rows is surfaced separately as `error_count`.
const scored = rows.filter((r) => !r.error);
const passes = scored.filter((r) => r.pass).length;
const insides = scored.filter((r) => r.inside).length;
const dists = scored.map((r) => r.dist).sort((a, b) => a - b);

// With nothing scored, every distance statistic is an explicit null instead of
// NaN/undefined, so the keys always appear in the JSON output.
const mean = dists.length === 0 ? null : dists.reduce((a, b) => a + b, 0) / dists.length;
const median = medianOfSorted(dists);
const max = dists.length === 0 ? null : dists[dists.length - 1];

const summary = {
  model: preds.model,
  tolerance_px: TOLERANCE_PX,
  total: scored.length,
  error_count: rows.length - scored.length,
  pass_count: passes,
  pass_rate: toRate(passes, scored.length),
  inside_bbox_count: insides,
  inside_bbox_rate: toRate(insides, scored.length),
  // Math.round(null) would yield 0, so the empty case is kept as null explicitly.
  mean_dist_px: mean === null ? null : Math.round(mean),
  median_dist_px: median,
  max_dist_px: max,
};
// Human-readable report: one line per row, then the aggregate summary as JSON.
console.log('Per-target results:');
console.log('─'.repeat(96));
for (const r of rows) {
  // Coerce the id so a missing or non-string id cannot crash the report.
  const idColumn = String(r.id).padEnd(18);
  if (r.error) {
    // Errored rows share the id column with scored rows; the tag keeps the id
    // and the message from running together.
    console.log(` [${r.fixture}] ${idColumn} ERROR: ${r.error}`);
    continue;
  }
  const tag = r.pass ? 'PASS' : 'FAIL';
  const insideTag = r.inside ? ' in-bbox' : '';
  console.log(
    ` [${r.fixture}] ${idColumn} ${tag} dist=${String(r.dist).padStart(4)}px${insideTag} pred=(${r.pred.x},${r.pred.y}) truth=(${r.truth_center.x},${r.truth_center.y}) "${r.label}"`,
  );
}
console.log('─'.repeat(96));
console.log('Summary:');
console.log(JSON.stringify(summary, null, 2));
// Persist the summary plus every per-target row under results/, in a file
// named after the predictions file that was scored.
const scoreFileName = `score-${path.basename(predPath, '.json')}.json`;
const out = path.join(ROOT, 'results', scoreFileName);
const serializedReport = JSON.stringify({ summary, rows }, null, 2);
writeFileSync(out, serializedReport);
console.log(`\nWrote ${out}`);