158 lines
6 KiB
TypeScript
158 lines
6 KiB
TypeScript
/**
 * Lightweight in-process metrics registry.
 *
 * Exposes a /metrics endpoint in Prometheus text format (v0.0.4).
 * No external dependencies — uses Node's built-in http module.
 *
 * Enable by setting METRICS_PORT (default 9100).
 * Set METRICS_PORT=0 to disable entirely.
 *
 * Metrics exposed (prefix = ${RUNTIME_ID}_):
 *   sessions_started_total            — counter
 *   sessions_completed_total          — counter {status="ok|error|timeout"}
 *   session_duration_seconds_sum      — counter (sum of durations)
 *   session_duration_seconds_count    — counter (number of completed sessions)
 *   llm_invocations_total             — counter {provider="..."}
 *   messages_received_total           — counter
 *   jail_ops_total                    — counter {op="..."}
 *   jail_ops_errors_total             — counter {op="..."}
 *   skill_searches_total              — counter
 *   skill_search_hits_total           — counter
 *   queue_active                      — gauge
 *   queue_depth                       — gauge
 *   http_surface_requests_total       — counter, by resolved host surface kind
 *   http_surface_404_total            — counter, by resolved host surface kind
 *   tenant_home_requests_total        — counter, by tenant id
 *   tenant_site_requests_total        — counter, by tenant.site id
 *   controlplane_root_redirects_total — counter
 */
|
|
import http from 'http';
|
|
|
|
import { RUNTIME_ID } from './config.js';
|
|
import { logger } from './logger.js';
|
|
|
|
/** Prefix prepended to the metric names in this module: the runtime id plus `_`. */
const P = `${RUNTIME_ID}_`;

// ── Registry ──────────────────────────────────────────────────────────────────

/** Unlabeled counters, keyed by metric name; the value is the running total. */
const counters: Map<string, number> = new Map();

/**
 * Labeled counters, keyed by metric name. Each inner map is keyed by the
 * rendered label pair (`key="val"`) and holds that series' running total.
 */
const labeledCounters: Map<string, Map<string, number>> = new Map();

/** Gauges, keyed by metric name; the callback returns the current value. */
const gauges: Map<string, () => number> = new Map();
|
|
|
|
/**
 * HELP text keyed by fully-prefixed metric name (optional per metric).
 *
 * The table below lists each metric without its prefix; the loop adds `P`
 * so the resulting keys match the names the renderer looks up.
 */
const help: Record<string, string> = {};
for (const [metric, text] of [
  ['sessions_started_total', 'Total agent sessions spawned'],
  ['sessions_completed_total', 'Total agent sessions completed by status'],
  ['session_duration_seconds_sum', 'Sum of agent session durations in seconds'],
  [
    'session_duration_seconds_count',
    'Number of agent sessions with recorded duration',
  ],
  ['llm_invocations_total', 'Total LLM invocations by provider'],
  ['messages_received_total', 'Total inbound messages routed to the queue'],
  ['jail_ops_total', 'Total privileged jail operations dispatched'],
  [
    'jail_ops_errors_total',
    'Total privileged jail operations that returned ok=false',
  ],
  ['skill_searches_total', 'Total built-in knowledge searches performed'],
  [
    'skill_search_hits_total',
    'Total built-in knowledge chunks returned across all searches',
  ],
  ['queue_active', 'Current number of active jail processes'],
  ['queue_depth', 'Current number of groups waiting for a jail slot'],
  [
    'http_surface_requests_total',
    'Total HTTP requests received by resolved host surface kind',
  ],
  [
    'http_surface_404_total',
    'Total HTTP 404 responses emitted by resolved host surface kind',
  ],
  ['tenant_home_requests_total', 'Total tenant-home root requests by tenant id'],
  [
    'tenant_site_requests_total',
    'Total tenant-site root requests by tenant.site id',
  ],
  [
    'controlplane_root_redirects_total',
    'Total redirects from controlplane root to /dashboard',
  ],
] as const) {
  help[`${P}${metric}`] = text;
}
|
|
|
|
// ── Public API ────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Add `by` to the unlabeled counter `name`, creating it at zero on first use.
 *
 * @param name Full metric name, used verbatim as the registry key.
 * @param by   Amount to add (defaults to 1). The value is not validated here.
 */
export function incCounter(name: string, by = 1): void {
  const previous = counters.get(name) ?? 0;
  counters.set(name, previous + by);
}
|
|
|
|
/**
 * Add `by` to one series of the labeled counter `name`.
 *
 * The series is identified by a single label pair, stored pre-rendered as
 * `labelKey="labelValue"`. Both the metric entry and the series are created
 * at zero on first use.
 *
 * @param name       Full metric name, used verbatim as the registry key.
 * @param labelKey   Label name; emitted as-is, so it must already be a valid
 *                   label name.
 * @param labelValue Label value; escaped here so arbitrary strings are safe.
 * @param by         Amount to add (defaults to 1).
 */
export function incLabeledCounter(
  name: string,
  labelKey: string,
  labelValue: string,
  by = 1,
): void {
  let series = labeledCounters.get(name);
  if (!series) {
    series = new Map();
    labeledCounters.set(name, series);
  }

  // The text exposition format requires backslash, double quote and line feed
  // to be escaped inside label values; an unescaped quote or newline would
  // corrupt the sample line. Backslash goes first so the escapes added for
  // the other two are not themselves re-escaped.
  const escapedValue = labelValue
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n');

  const pair = `${labelKey}="${escapedValue}"`;
  series.set(pair, (series.get(pair) ?? 0) + by);
}
|
|
|
|
/**
 * Register (or replace) the gauge `name`.
 *
 * Only the callback is stored; it is not invoked here. Registering the same
 * name again overwrites the previous callback.
 *
 * @param name Full metric name, used verbatim as the registry key.
 * @param fn   Callback returning the gauge's current value.
 */
export function registerGauge(name: string, fn: () => number): void {
  gauges.set(name, fn);
}
|
|
|
|
// ── Renderer ──────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Render every registered metric as text exposition lines.
 *
 * Output order is: unlabeled counters, labeled counters, then gauges, each in
 * registration order. Every metric gets a `# TYPE` line, preceded by a
 * `# HELP` line when help text exists for its name. Gauge callbacks are
 * invoked during rendering; anything they throw propagates to the caller.
 *
 * @returns The rendered lines joined by `\n`, with a trailing newline.
 */
export function renderMetrics(): string {
  const out: string[] = [];

  // Emits the optional HELP line followed by the TYPE line for one metric.
  const header = (metric: string, type: 'counter' | 'gauge'): void => {
    const text = help[metric];
    if (text) out.push(`# HELP ${metric} ${text}`);
    out.push(`# TYPE ${metric} ${type}`);
  };

  counters.forEach((total, metric) => {
    header(metric, 'counter');
    out.push(`${metric} ${total}`);
  });

  labeledCounters.forEach((series, metric) => {
    header(metric, 'counter');
    series.forEach((total, pair) => {
      out.push(`${metric}{${pair}} ${total}`);
    });
  });

  gauges.forEach((read, metric) => {
    header(metric, 'gauge');
    out.push(`${metric} ${read()}`);
  });

  return `${out.join('\n')}\n`;
}
|
|
|
|
// ── HTTP server ───────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Start the HTTP server that serves the metrics and health endpoints.
 *
 * Routes (GET only, matched on the URL path so a query string is ignored):
 *   /metrics — rendered metrics, or 500 if rendering throws
 *   /healthz — plain-text `ok`
 * Anything else receives an empty 404.
 *
 * @param port TCP port to listen on; `0` disables the server.
 * @returns The server instance, or `null` when `port` is `0`. Listen and
 *          runtime errors are logged, not thrown.
 */
export function startMetricsServer(port: number): http.Server | null {
  if (port === 0) return null;

  const server = http.createServer((req, res) => {
    // Route on the path alone so requests carrying query parameters
    // (e.g. `/metrics?x=1`) are not rejected with 404.
    const path = (req.url ?? '').split('?', 1)[0];
    const isGet = req.method === 'GET';

    if (isGet && path === '/metrics') {
      let body: string;
      try {
        body = renderMetrics();
      } catch (err) {
        // Rendering invokes registered gauge callbacks. An exception escaping
        // this listener would surface as an uncaught exception, so report it
        // as a failed scrape instead.
        logger.error({ err }, 'Metrics render failed');
        res.writeHead(500, { 'Content-Type': 'text/plain' });
        res.end('metrics render failed\n');
        return;
      }
      res.writeHead(200, {
        'Content-Type': 'text/plain; version=0.0.4; charset=utf-8',
        'Content-Length': Buffer.byteLength(body),
      });
      res.end(body);
    } else if (isGet && path === '/healthz') {
      res.writeHead(200, { 'Content-Type': 'text/plain' });
      res.end('ok\n');
    } else {
      res.writeHead(404);
      res.end();
    }
  });

  // Bind to all interfaces so the management jail can scrape via the bridge IP.
  // Access is restricted by PF — see setup/pf.ts metrics section.
  server.listen(port, '0.0.0.0', () => {
    logger.info({ port }, 'Metrics server listening');
  });

  server.on('error', (err) => {
    logger.error({ err, port }, 'Metrics server error');
  });

  return server;
}
|