Add doctor checks for DNS and morning reports (Sam & Codex)

---
Build: pass | Tests: pass — 2249 passed (666 files)
This commit is contained in:
Operator & Codex 2026-05-09 14:47:10 +02:00
parent 85da1ee050
commit 5f6f6aaede

View file

@ -1,15 +1,27 @@
import { SERVICE_NAME } from './platform-identity.js';
import { spawnSync } from 'child_process';
import dns from 'dns';
import net from 'net';
import {
CMS_INTERNAL_DOMAIN,
CMS_JAIL_IP,
CODE_SERVICE_INTERNAL_DOMAIN,
CONTROLPLANE_INTERNAL_DOMAIN,
GIT_JAIL_IP,
MAIN_GROUP_FOLDER,
PLATFORM_INTERNAL_BASE,
STRIPE_KEY_MODE,
STRIPE_REFUNDS_ENABLED,
STRIPE_STATUS,
SUBNET_BASE,
TENANT_ID,
TENANT_DISPLAY_NAME,
} from './config.js';
import { closePool, getPool, getTaskById } from './db.js';
import { formatDisplayDate as formatHumanDate } from './display-date.js';
import { assessHealth } from './health.js';
import { platformServiceDomain } from './platform-layout.js';
import { tenantContextWarning } from './startup-report.js';
import { formatSurfaceMapLines } from './surface-map.js';
import { getTenantRecord, loadTenantRegistry } from './tenant-registry.js';
@ -21,6 +33,9 @@ import {
} from './split-brain-status.js';
import { WATCHDOG_SOCKET_PATH } from './watchdog.js';
// Id of the scheduled task inspected by collectMorningReportIssues; used both
// for the getTaskById lookup and as the task_id filter on task_run_logs.
const MORNING_REPORT_TASK_ID = 'morning-report-0800';
// Cron expression the task's schedule_value must equal (minute 0, hour 8,
// every day); any other schedule is reported as an issue.
const MORNING_REPORT_CRON = '0 8 * * *';
interface WatchdogStatus {
mode: string;
throttled: boolean;
@ -62,22 +77,150 @@ function queryWatchdog(): Promise<WatchdogStatus | null> {
});
}
function formatSnapshotDate(timestamp?: string): string {
/**
 * Renders an optional timestamp for the doctor report.
 *
 * @param timestamp - Timestamp string; may be omitted, `null`, or empty.
 * @returns `'n/a'` when no timestamp is given, otherwise the result of
 *   `formatHumanDate` for that timestamp.
 */
function formatSnapshotDate(timestamp?: string | null): string {
  return timestamp ? formatHumanDate(timestamp) : 'n/a';
}
/**
 * Converts an optional timestamp string into epoch milliseconds.
 *
 * @param timestamp - Timestamp string; may be omitted, `null`, or empty.
 * @returns The `Date.parse` result in milliseconds, or `null` when the
 *   timestamp is absent, empty, or cannot be parsed.
 */
function parseTime(timestamp?: string | null): number | null {
  if (timestamp === undefined || timestamp === null || timestamp === '') {
    return null;
  }
  const millis = Date.parse(timestamp);
  if (Number.isNaN(millis)) {
    return null;
  }
  return millis;
}
/**
 * Looks up the IPv4 (A) records of `host` using a resolver pinned to a single
 * DNS server instead of the process-wide resolver configuration.
 *
 * @param server - Address of the DNS server to query.
 * @param host - Hostname to resolve.
 * @returns The IPv4 addresses returned by `resolve4`.
 * @throws Rejects when the server address is not accepted by `setServers` or
 *   when the lookup itself fails.
 */
async function resolveFrom(server: string, host: string): Promise<string[]> {
  const pinnedResolver = new dns.promises.Resolver();
  pinnedResolver.setServers([server]);
  const addresses = await pinnedResolver.resolve4(host);
  return addresses;
}
/**
 * Reports whether a `sockstat` listing contains a socket bound to exactly
 * `address` (formatted as `ip:port`).
 *
 * Whole whitespace-separated columns are compared rather than substrings, so
 * `127.0.0.1:53` does not match `127.0.0.1:5353`, and `10.0.0.1:53` does not
 * match `110.0.0.1:53`.
 */
function sockstatHasListener(output: string, address: string): boolean {
  return output.split(/\s+/).includes(address);
}

/**
 * Runs the dnsmasq-related doctor checks.
 *
 * Three things are inspected:
 *  1. `service dnsmasq onestatus` — exit code 0 is treated as "running".
 *  2. `sockstat -4 -l` — a listener on `127.0.0.1:53` and on the gateway
 *     address (`${SUBNET_BASE}.1:53`) is expected.
 *  3. Each expected internal hostname is resolved through `127.0.0.1` and must
 *     include its expected IPv4 address.
 *
 * @returns `lines` to print in the report and `issues` describing every failed
 *   check. Command and lookup failures are reported as issues, not thrown.
 */
async function collectDnsIssues(): Promise<{ lines: string[]; issues: string[] }> {
  const lines: string[] = [];
  const issues: string[] = [];
  const gatewayIp = `${SUBNET_BASE}.1`;
  const expected = [
    { host: CONTROLPLANE_INTERNAL_DOMAIN, ip: gatewayIp },
    { host: CMS_INTERNAL_DOMAIN, ip: CMS_JAIL_IP },
    { host: platformServiceDomain('web', PLATFORM_INTERNAL_BASE), ip: CMS_JAIL_IP },
    { host: CODE_SERVICE_INTERNAL_DOMAIN, ip: GIT_JAIL_IP },
  ];

  const status = spawnSync('service', ['dnsmasq', 'onestatus'], {
    encoding: 'utf-8',
  });
  const serviceOk = status.status === 0;
  lines.push(`DNSMASQ_SERVICE: ${serviceOk ? 'running' : 'not-running'}`);
  if (!serviceOk) {
    // When the command cannot be launched at all, stderr/stdout are empty and
    // the only diagnostic is the spawn error itself.
    const detail = (status.stderr || status.stdout || status.error?.message || '').trim();
    issues.push(`dnsmasq service is not running${detail ? ` (${detail})` : ''}`);
  }

  const sockstat = spawnSync('sockstat', ['-4', '-l'], { encoding: 'utf-8' });
  if (sockstat.status === 0) {
    const out = sockstat.stdout;
    // Exact column match: a substring test would accept e.g. port 5353.
    const loopbackListening = sockstatHasListener(out, '127.0.0.1:53');
    const gatewayListening = sockstatHasListener(out, `${gatewayIp}:53`);
    lines.push(`DNSMASQ_LISTEN_LOCALHOST: ${loopbackListening ? 'yes' : 'no'}`);
    lines.push(`DNSMASQ_LISTEN_GATEWAY: ${gatewayListening ? 'yes' : 'no'} (${gatewayIp})`);
    if (!loopbackListening) issues.push('dnsmasq is not listening on 127.0.0.1:53');
    if (!gatewayListening) issues.push(`dnsmasq is not listening on ${gatewayIp}:53`);
  } else {
    lines.push('DNSMASQ_LISTEN: unknown');
    issues.push('unable to inspect dnsmasq listeners with sockstat');
  }

  for (const { host, ip } of expected) {
    // Report key derived from the hostname: upper-cased, with every run of
    // non-alphanumeric characters collapsed to a single underscore.
    const label = `DNS_${host.toUpperCase().replace(/[^A-Z0-9]+/g, '_')}`;
    try {
      const records = await resolveFrom('127.0.0.1', host);
      const ok = records.includes(ip);
      lines.push(`${label}: ${ok ? 'ok' : `mismatch (${records.join(',') || 'none'})`}`);
      if (!ok) issues.push(`${host} does not resolve to ${ip} via dnsmasq localhost`);
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      lines.push(`${label}: error`);
      issues.push(`${host} does not resolve via dnsmasq localhost (${detail})`);
    }
  }

  return { lines, issues };
}
/**
 * Runs the doctor checks for the morning-report scheduled task.
 *
 * The task is looked up by `MORNING_REPORT_TASK_ID` and checked for:
 * status `active`, group folder `MAIN_GROUP_FOLDER`, schedule
 * `cron:MORNING_REPORT_CRON`, and a `next_run` that is neither more than
 * 10 minutes overdue nor more than 30 hours away. The newest row of
 * `task_run_logs` for the task is then reported and flagged when its status is
 * `error`.
 *
 * Task inspection and run-log inspection fail independently: if the run-log
 * query throws, the task lines already collected stay valid and only
 * `MORNING_REPORT_LAST_LOG` is reported as unknown.
 *
 * @returns `lines` to print in the report and `issues` describing every failed
 *   check. Lookup and query failures are reported as issues, not thrown.
 */
async function collectMorningReportIssues(): Promise<{ lines: string[]; issues: string[] }> {
  const lines: string[] = [];
  const issues: string[] = [];

  try {
    const task = await getTaskById(MORNING_REPORT_TASK_ID);
    if (!task) {
      lines.push('MORNING_REPORT_TASK: missing');
      issues.push(`${MORNING_REPORT_TASK_ID} scheduled task is missing`);
      return { lines, issues };
    }

    lines.push(`MORNING_REPORT_TASK: ${task.status}`);
    lines.push(`MORNING_REPORT_CRON: ${task.schedule_value}`);
    lines.push(`MORNING_REPORT_NEXT_RUN: ${formatSnapshotDate(task.next_run)}`);
    lines.push(`MORNING_REPORT_LAST_RUN: ${formatSnapshotDate(task.last_run)}`);

    if (task.status !== 'active') {
      issues.push(`${MORNING_REPORT_TASK_ID} is ${task.status}, not active`);
    }
    if (task.group_folder !== MAIN_GROUP_FOLDER) {
      issues.push(`${MORNING_REPORT_TASK_ID} targets group folder ${task.group_folder}, expected ${MAIN_GROUP_FOLDER}`);
    }
    if (task.schedule_type !== 'cron' || task.schedule_value !== MORNING_REPORT_CRON) {
      issues.push(`${MORNING_REPORT_TASK_ID} schedule is ${task.schedule_type}:${task.schedule_value}, expected cron:${MORNING_REPORT_CRON}`);
    }

    const nextRun = parseTime(task.next_run);
    const now = Date.now();
    if (nextRun === null) {
      issues.push(`${MORNING_REPORT_TASK_ID} has no valid next_run`);
    } else {
      const untilNext = nextRun - now;
      // Negative means next_run is already in the past.
      if (untilNext < -10 * 60 * 1000) {
        issues.push(`${MORNING_REPORT_TASK_ID} next_run is overdue by more than 10 minutes`);
      }
      if (untilNext > 30 * 60 * 60 * 1000) {
        issues.push(`${MORNING_REPORT_TASK_ID} next_run is more than 30 hours away`);
      }
    }
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    lines.push('MORNING_REPORT_TASK: unknown');
    issues.push(`unable to inspect morning report schedule (${detail})`);
    // Same as before: when the task cannot be inspected, the run log is not queried.
    return { lines, issues };
  }

  // Separate try/catch so a failing log query does not append a contradictory
  // "MORNING_REPORT_TASK: unknown" after the task status was already reported.
  try {
    const last = await getPool().query(
      `SELECT run_at, status, error FROM task_run_logs WHERE task_id = $1 ORDER BY run_at DESC LIMIT 1`,
      [MORNING_REPORT_TASK_ID],
    );
    const latest = last.rows[0] as { run_at?: string; status?: string; error?: string } | undefined;
    lines.push(`MORNING_REPORT_LAST_LOG: ${latest ? `${latest.status || 'unknown'} at ${formatSnapshotDate(latest.run_at)}` : 'none'}`);
    if (latest?.status === 'error') {
      issues.push(`${MORNING_REPORT_TASK_ID} latest run failed${latest.error ? `: ${latest.error}` : ''}`);
    }
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    lines.push('MORNING_REPORT_LAST_LOG: unknown');
    issues.push(`unable to inspect morning report run log (${detail})`);
  }

  return { lines, issues };
}
async function main(): Promise<void> {
const health = assessHealth();
const splitBrain = await collectSplitBrainStatus();
const watchdog = await queryWatchdog();
const dnsCheck = await collectDnsIssues();
const morningReportCheck = await collectMorningReportIssues();
const splitBrainIssues = collectSplitBrainIssues(splitBrain);
const splitBrainStatus = deriveSplitBrainReadiness(splitBrain);
const tmpMountIssues = collectTmpMountFindings().map(
({ source, mountpoint }) =>
`mounted filesystem inside tmp/: ${mountpoint} (${source}) — tmp/ must contain ordinary temp files only`,
);
const issues = [...health.issues, ...splitBrainIssues, ...tmpMountIssues];
const issues = [
...health.issues,
...splitBrainIssues,
...tmpMountIssues,
...dnsCheck.issues,
...morningReportCheck.issues,
];
const criticalSplitBrain =
splitBrain.skillsDb !== 'available' ||
splitBrain.memoryDb !== 'available' ||
@ -165,6 +308,12 @@ async function main(): Promise<void> {
} else {
console.log('WATCHDOG: offline');
}
for (const line of dnsCheck.lines) {
console.log(line);
}
for (const line of morningReportCheck.lines) {
console.log(line);
}
console.log(`SPLIT_BRAIN: ${splitBrainStatus}`);
console.log(`SKILLS_DB: ${splitBrain.skillsDb}`);
console.log(`SKILLS_ARTIFACT: ${splitBrain.skillsArtifact}`);
@ -185,7 +334,11 @@ async function main(): Promise<void> {
}
}
await closePool();
if (status === 'error') process.exitCode = 1;
}
await main();
// Run the doctor, then close the database pool whether main() resolved or rejected.
await main().finally(() => closePool());