clawdie-ai/src/metrics.ts
2026-05-04 06:24:32 +02:00

158 lines
6 KiB
TypeScript

/**
* Lightweight in-process metrics registry.
*
* Exposes a /metrics endpoint in Prometheus text format (v0.0.4) and a
* plain-text /healthz endpoint on the same port.
* No external dependencies — uses Node's built-in http module.
*
* Enable by setting METRICS_PORT (default 9100).
* Set METRICS_PORT=0 to disable entirely.
*
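* Metrics exposed (prefix = ${RUNTIME_ID}_):
* sessions_started_total — counter
* sessions_completed_total — counter {status="ok|error|timeout"}
* session_duration_seconds_sum — counter (sum of durations)
* session_duration_seconds_count — counter (number of completed sessions)
* llm_invocations_total — counter {provider="..."}
* messages_received_total — counter
* jail_ops_total — counter {op="..."}
* jail_ops_errors_total — counter {op="..."}
* skill_searches_total — counter
* skill_search_hits_total — counter
* queue_active — gauge
* queue_depth — gauge
*
* HELP text is also defined in this file for:
* http_surface_requests_total — HTTP requests by resolved host surface kind
* http_surface_404_total — HTTP 404 responses by resolved host surface kind
* tenant_home_requests_total — tenant-home root requests by tenant id
* tenant_site_requests_total — tenant-site root requests by tenant.site id
* controlplane_root_redirects_total — redirects from controlplane root to /dashboard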
*/
import http from 'http';
import { RUNTIME_ID } from './config.js';
import { logger } from './logger.js';
/** Metric-name prefix derived from the runtime id; used to key the HELP table. */
const P = `${RUNTIME_ID}_`;

// ── Registry ──────────────────────────────────────────────────────────────────

/** Unlabeled counters: metric name → running total. */
const counters: Map<string, number> = new Map();

/** Labeled counters: metric name → (rendered `key="value"` pair → running total). */
const labeledCounters: Map<string, Map<string, number>> = new Map();

/** Gauges: metric name → callback invoked by `renderMetrics` to read the current value. */
const gauges: Map<string, () => number> = new Map();
/**
 * HELP text per metric (counters and gauges), keyed by the fully prefixed
 * metric name. A metric without an entry is rendered without a `# HELP` line.
 */
const help: Record<string, string> = Object.fromEntries(
  (
    [
      ['sessions_started_total', 'Total agent sessions spawned'],
      ['sessions_completed_total', 'Total agent sessions completed by status'],
      [
        'session_duration_seconds_sum',
        'Sum of agent session durations in seconds',
      ],
      [
        'session_duration_seconds_count',
        'Number of agent sessions with recorded duration',
      ],
      ['llm_invocations_total', 'Total LLM invocations by provider'],
      ['messages_received_total', 'Total inbound messages routed to the queue'],
      ['jail_ops_total', 'Total privileged jail operations dispatched'],
      [
        'jail_ops_errors_total',
        'Total privileged jail operations that returned ok=false',
      ],
      ['skill_searches_total', 'Total built-in knowledge searches performed'],
      [
        'skill_search_hits_total',
        'Total built-in knowledge chunks returned across all searches',
      ],
      ['queue_active', 'Current number of active jail processes'],
      ['queue_depth', 'Current number of groups waiting for a jail slot'],
      [
        'http_surface_requests_total',
        'Total HTTP requests received by resolved host surface kind',
      ],
      [
        'http_surface_404_total',
        'Total HTTP 404 responses emitted by resolved host surface kind',
      ],
      [
        'tenant_home_requests_total',
        'Total tenant-home root requests by tenant id',
      ],
      [
        'tenant_site_requests_total',
        'Total tenant-site root requests by tenant.site id',
      ],
      [
        'controlplane_root_redirects_total',
        'Total redirects from controlplane root to /dashboard',
      ],
    ] as const
  ).map(([suffix, text]): [string, string] => [`${P}${suffix}`, text]),
);
// ── Public API ────────────────────────────────────────────────────────────────
/**
 * Adds `by` to the unlabeled counter `name`, creating it at zero on first use.
 *
 * @param name - Metric name, used verbatim as the registry key.
 * @param by - Amount to add; defaults to 1.
 */
export function incCounter(name: string, by = 1): void {
  const current = counters.get(name) ?? 0;
  counters.set(name, current + by);
}
/**
 * Adds `by` to the series of labeled counter `name` identified by a single
 * `labelKey="labelValue"` pair, creating the counter and the series on first use.
 *
 * The label value is escaped for the Prometheus text format (`\` → `\\`,
 * `"` → `\"`, newline → `\n`) before it becomes part of the series key, so a
 * value containing those characters cannot break the rendered line or smuggle
 * extra samples into the output. Values without those characters produce the
 * same key as before.
 *
 * @param name - Metric name, used verbatim as the registry key.
 * @param labelKey - Label name; emitted as-is.
 * @param labelValue - Raw label value; escaped before being stored.
 * @param by - Amount to add; defaults to 1.
 */
export function incLabeledCounter(
  name: string,
  labelKey: string,
  labelValue: string,
  by = 1,
): void {
  let series = labeledCounters.get(name);
  if (!series) {
    series = new Map();
    labeledCounters.set(name, series);
  }

  // Only backslash, double quote and line feed are special inside a label value.
  const escapedValue = labelValue.replace(/[\\"\n]/g, (ch) =>
    ch === '\n' ? '\\n' : `\\${ch}`,
  );

  const pair = `${labelKey}="${escapedValue}"`;
  series.set(pair, (series.get(pair) ?? 0) + by);
}
/**
 * Registers (or replaces) the gauge `name`.
 *
 * @param name - Metric name, used verbatim as the registry key.
 * @param fn - Callback returning the gauge's current value; `renderMetrics`
 *   calls it each time it renders.
 */
export function registerGauge(name: string, fn: () => number): void {
  // Map.set overwrites, so a later registration under the same name wins.
  gauges.set(name, fn);
}
// ── Renderer ──────────────────────────────────────────────────────────────────
/**
 * Renders every registered metric as Prometheus text-format lines.
 *
 * Output order is: unlabeled counters, labeled counters, then gauges, each in
 * registry insertion order. Every metric gets a `# TYPE` line, preceded by a
 * `# HELP` line when the HELP table has text for it. Gauge callbacks are
 * invoked during rendering.
 *
 * @returns The rendered lines joined with `\n` and terminated by a final `\n`
 *   (an empty registry therefore yields a single newline).
 */
export function renderMetrics(): string {
  const out: string[] = [];

  // Emits the optional HELP line and the mandatory TYPE line for one metric.
  const emitHeader = (metric: string, type: 'counter' | 'gauge'): void => {
    const text = help[metric];
    if (text) out.push(`# HELP ${metric} ${text}`);
    out.push(`# TYPE ${metric} ${type}`);
  };

  counters.forEach((total, metric) => {
    emitHeader(metric, 'counter');
    out.push(`${metric} ${total}`);
  });

  labeledCounters.forEach((series, metric) => {
    emitHeader(metric, 'counter');
    series.forEach((total, labels) => {
      out.push(`${metric}{${labels}} ${total}`);
    });
  });

  gauges.forEach((read, metric) => {
    emitHeader(metric, 'gauge');
    out.push(`${metric} ${read()}`);
  });

  return `${out.join('\n')}\n`;
}
// ── HTTP server ───────────────────────────────────────────────────────────────
/**
 * Starts the HTTP server that serves the metrics registry.
 *
 * Routes (GET only; anything else gets an empty 404):
 * - `/metrics` — output of `renderMetrics()` in Prometheus text format.
 * - `/healthz` — plain-text `ok`.
 *
 * Routing looks at the path only, so a query string on the scrape URL does
 * not turn a valid request into a 404.
 *
 * @param port - TCP port to listen on; `0` disables the server.
 * @returns The server (returned before it has finished binding), or `null`
 *   when `port` is `0`.
 */
export function startMetricsServer(port: number): http.Server | null {
  if (port === 0) return null;

  const server = http.createServer((req, res) => {
    const isGet = req.method === 'GET';
    // Drop any query string before matching the route.
    const path = (req.url ?? '').split('?', 1)[0];

    if (isGet && path === '/metrics') {
      let body: string;
      try {
        body = renderMetrics();
      } catch (err) {
        // Rendering invokes gauge callbacks. If one throws, the exception
        // would otherwise escape this listener as an uncaught exception, so
        // answer 500 and keep the process alive.
        logger.error({ err }, 'Metrics render failed');
        res.writeHead(500, { 'Content-Type': 'text/plain' });
        res.end('metrics render failed\n');
        return;
      }
      res.writeHead(200, {
        'Content-Type': 'text/plain; version=0.0.4; charset=utf-8',
        'Content-Length': Buffer.byteLength(body),
      });
      res.end(body);
      return;
    }

    if (isGet && path === '/healthz') {
      res.writeHead(200, { 'Content-Type': 'text/plain' });
      res.end('ok\n');
      return;
    }

    res.writeHead(404);
    res.end();
  });

  // Attach the error handler before listening so bind failures are logged.
  server.on('error', (err) => {
    logger.error({ err, port }, 'Metrics server error');
  });

  // Bind to all interfaces so the management jail can scrape via the bridge IP.
  // Access is restricted by PF — see setup/pf.ts metrics section.
  server.listen(port, '0.0.0.0', () => {
    logger.info({ port }, 'Metrics server listening');
  });

  return server;
}