Add doctor checks for DNS and morning reports (Sam & Codex)
--- Build: pass | Tests: pass — 2249 passed (666 files)
This commit is contained in:
parent
85da1ee050
commit
5f6f6aaede
1 changed file with 156 additions and 3 deletions
159
src/doctor.ts
159
src/doctor.ts
|
|
@ -1,15 +1,27 @@
|
|||
import { SERVICE_NAME } from './platform-identity.js';
|
||||
import { spawnSync } from 'child_process';
|
||||
import dns from 'dns';
|
||||
import net from 'net';
|
||||
|
||||
import {
|
||||
CMS_INTERNAL_DOMAIN,
|
||||
CMS_JAIL_IP,
|
||||
CODE_SERVICE_INTERNAL_DOMAIN,
|
||||
CONTROLPLANE_INTERNAL_DOMAIN,
|
||||
GIT_JAIL_IP,
|
||||
MAIN_GROUP_FOLDER,
|
||||
PLATFORM_INTERNAL_BASE,
|
||||
STRIPE_KEY_MODE,
|
||||
STRIPE_REFUNDS_ENABLED,
|
||||
STRIPE_STATUS,
|
||||
SUBNET_BASE,
|
||||
TENANT_ID,
|
||||
TENANT_DISPLAY_NAME,
|
||||
} from './config.js';
|
||||
import { closePool, getPool, getTaskById } from './db.js';
|
||||
import { formatDisplayDate as formatHumanDate } from './display-date.js';
|
||||
import { assessHealth } from './health.js';
|
||||
import { platformServiceDomain } from './platform-layout.js';
|
||||
import { tenantContextWarning } from './startup-report.js';
|
||||
import { formatSurfaceMapLines } from './surface-map.js';
|
||||
import { getTenantRecord, loadTenantRegistry } from './tenant-registry.js';
|
||||
|
|
@ -21,6 +33,9 @@ import {
|
|||
} from './split-brain-status.js';
|
||||
import { WATCHDOG_SOCKET_PATH } from './watchdog.js';
|
||||
|
||||
// ID of the scheduled task that the morning-report doctor check looks up and whose run log it queries.
const MORNING_REPORT_TASK_ID = 'morning-report-0800';
|
||||
// Schedule value the morning-report task is expected to carry; the doctor flags any other value.
// In standard five-field cron this is 08:00 every day (time zone depends on the scheduler — not visible here).
const MORNING_REPORT_CRON = '0 8 * * *';
|
||||
|
||||
interface WatchdogStatus {
|
||||
mode: string;
|
||||
throttled: boolean;
|
||||
|
|
@ -62,22 +77,150 @@ function queryWatchdog(): Promise<WatchdogStatus | null> {
|
|||
});
|
||||
}
|
||||
|
||||
function formatSnapshotDate(timestamp?: string): string {
|
||||
/**
 * Renders a timestamp for the doctor report.
 *
 * @param timestamp - Timestamp string to display; may be absent or null.
 * @returns `'n/a'` when no timestamp is given (including the empty string),
 *   otherwise whatever `formatHumanDate` produces for it.
 */
function formatSnapshotDate(timestamp?: string | null): string {
  return timestamp ? formatHumanDate(timestamp) : 'n/a';
}
|
||||
|
||||
/**
 * Converts a timestamp string into epoch milliseconds.
 *
 * @param timestamp - Date string handed to `Date.parse`; may be absent or null.
 * @returns Milliseconds since the Unix epoch, or `null` when the input is
 *   missing, empty, or cannot be parsed.
 */
function parseTime(timestamp?: string | null): number | null {
  // A missing value is treated like an unparseable one so both take the same exit.
  const epochMs = timestamp ? Date.parse(timestamp) : Number.NaN;
  return Number.isNaN(epochMs) ? null : epochMs;
}
|
||||
|
||||
/**
 * Resolves the IPv4 (A) records for `host` by asking one specific DNS server
 * through a dedicated resolver instance, rather than the process-wide one.
 *
 * @param server - Address of the DNS server to query.
 * @param host - Hostname to resolve.
 * @returns The addresses returned by `resolve4`; the promise rejects when the
 *   lookup fails.
 */
async function resolveFrom(server: string, host: string): Promise<string[]> {
  const scopedResolver = new dns.promises.Resolver();
  scopedResolver.setServers([server]);
  const addresses = await scopedResolver.resolve4(host);
  return addresses;
}
|
||||
|
||||
/**
 * Reports whether `address` (in `ip:port` form) appears as a complete
 * whitespace-delimited field of `sockstat` output.
 *
 * A plain substring search is not enough: looking for `127.0.0.1:53` would
 * also accept `127.0.0.1:5353`, and `10.0.0.1:53` would accept `210.0.0.1:53`.
 */
function sockstatHasListener(sockstatOutput: string, address: string): boolean {
  return sockstatOutput.split(/\s+/).includes(address);
}

/**
 * Runs the DNS section of the doctor report.
 *
 * Checks, in order:
 *  1. `service dnsmasq onestatus` exits with status 0;
 *  2. `sockstat -4 -l` lists listeners on `127.0.0.1:53` and `<gateway>:53`;
 *  3. each expected internal hostname resolves, via the resolver at
 *     `127.0.0.1`, to a record set that contains its expected IP.
 *
 * @returns `lines` — `KEY: value` report lines in the order the checks ran;
 *   `issues` — one human-readable entry per failed check. The function does
 *   not throw for failed checks; failures are reported through `issues`.
 */
async function collectDnsIssues(): Promise<{ lines: string[]; issues: string[] }> {
  const lines: string[] = [];
  const issues: string[] = [];
  const gatewayIp = `${SUBNET_BASE}.1`;
  const expected = [
    { host: CONTROLPLANE_INTERNAL_DOMAIN, ip: gatewayIp },
    { host: CMS_INTERNAL_DOMAIN, ip: CMS_JAIL_IP },
    { host: platformServiceDomain('web', PLATFORM_INTERNAL_BASE), ip: CMS_JAIL_IP },
    { host: CODE_SERVICE_INTERNAL_DOMAIN, ip: GIT_JAIL_IP },
  ];

  const status = spawnSync('service', ['dnsmasq', 'onestatus'], {
    encoding: 'utf-8',
  });
  const serviceOk = status.status === 0;
  lines.push(`DNSMASQ_SERVICE: ${serviceOk ? 'running' : 'not-running'}`);
  if (!serviceOk) {
    // When the command could not be launched at all, stderr/stdout are empty
    // and the only explanation is the spawn error, so fall back to it.
    const detail = (status.stderr || status.stdout || status.error?.message || '').trim();
    issues.push(`dnsmasq service is not running${detail ? ` (${detail})` : ''}`);
  }

  const sockstat = spawnSync('sockstat', ['-4', '-l'], { encoding: 'utf-8' });
  if (sockstat.status === 0) {
    const out = sockstat.stdout;
    // Exact field match so that e.g. port 5353 is not mistaken for port 53.
    const loopbackListening = sockstatHasListener(out, '127.0.0.1:53');
    const gatewayListening = sockstatHasListener(out, `${gatewayIp}:53`);
    lines.push(`DNSMASQ_LISTEN_LOCALHOST: ${loopbackListening ? 'yes' : 'no'}`);
    lines.push(`DNSMASQ_LISTEN_GATEWAY: ${gatewayListening ? 'yes' : 'no'} (${gatewayIp})`);
    if (!loopbackListening) issues.push('dnsmasq is not listening on 127.0.0.1:53');
    if (!gatewayListening) issues.push(`dnsmasq is not listening on ${gatewayIp}:53`);
  } else {
    lines.push('DNSMASQ_LISTEN: unknown');
    issues.push('unable to inspect dnsmasq listeners with sockstat');
  }

  // Lookups run one after another so report lines keep the order of `expected`.
  for (const { host, ip } of expected) {
    // Report key derived from the hostname: upper-cased, with every run of
    // non-alphanumeric characters collapsed to a single underscore.
    const label = `DNS_${host.toUpperCase().replace(/[^A-Z0-9]+/g, '_')}`;
    try {
      const records = await resolveFrom('127.0.0.1', host);
      const ok = records.includes(ip);
      lines.push(`${label}: ${ok ? 'ok' : `mismatch (${records.join(',') || 'none'})`}`);
      if (!ok) issues.push(`${host} does not resolve to ${ip} via dnsmasq localhost`);
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      lines.push(`${label}: error`);
      issues.push(`${host} does not resolve via dnsmasq localhost (${detail})`);
    }
  }

  return { lines, issues };
}
|
||||
|
||||
/**
 * Runs the morning-report section of the doctor report.
 *
 * Loads the scheduled task identified by `MORNING_REPORT_TASK_ID` and checks
 * that it exists, is `active`, targets `MAIN_GROUP_FOLDER`, uses the cron
 * schedule `MORNING_REPORT_CRON`, and has a `next_run` that is neither more
 * than 10 minutes overdue nor more than 30 hours away. It then reads the
 * newest `task_run_logs` row for the task and flags a latest status of
 * `error`.
 *
 * @returns `lines` — `KEY: value` report lines; `issues` — one entry per
 *   failed check. Lookup failures are reported through `issues` rather than
 *   thrown.
 */
async function collectMorningReportIssues(): Promise<{ lines: string[]; issues: string[] }> {
  const lines: string[] = [];
  const issues: string[] = [];
  // Tolerated lateness of next_run before it counts as overdue.
  const overdueGraceMs = 10 * 60 * 1000;
  // Furthest into the future a next_run may lie before it is flagged.
  const maxLeadMs = 30 * 60 * 60 * 1000;

  try {
    const task = await getTaskById(MORNING_REPORT_TASK_ID);
    if (!task) {
      lines.push('MORNING_REPORT_TASK: missing');
      issues.push(`${MORNING_REPORT_TASK_ID} scheduled task is missing`);
      return { lines, issues };
    }

    lines.push(`MORNING_REPORT_TASK: ${task.status}`);
    lines.push(`MORNING_REPORT_CRON: ${task.schedule_value}`);
    lines.push(`MORNING_REPORT_NEXT_RUN: ${formatSnapshotDate(task.next_run)}`);
    lines.push(`MORNING_REPORT_LAST_RUN: ${formatSnapshotDate(task.last_run)}`);

    if (task.status !== 'active') {
      issues.push(`${MORNING_REPORT_TASK_ID} is ${task.status}, not active`);
    }
    if (task.group_folder !== MAIN_GROUP_FOLDER) {
      issues.push(`${MORNING_REPORT_TASK_ID} targets group folder ${task.group_folder}, expected ${MAIN_GROUP_FOLDER}`);
    }
    if (task.schedule_type !== 'cron' || task.schedule_value !== MORNING_REPORT_CRON) {
      issues.push(`${MORNING_REPORT_TASK_ID} schedule is ${task.schedule_type}:${task.schedule_value}, expected cron:${MORNING_REPORT_CRON}`);
    }

    const nextRun = parseTime(task.next_run);
    if (nextRun === null) {
      issues.push(`${MORNING_REPORT_TASK_ID} has no valid next_run`);
    } else {
      const untilNext = nextRun - Date.now();
      if (untilNext < -overdueGraceMs) {
        issues.push(`${MORNING_REPORT_TASK_ID} next_run is overdue by more than 10 minutes`);
      }
      if (untilNext > maxLeadMs) {
        issues.push(`${MORNING_REPORT_TASK_ID} next_run is more than 30 hours away`);
      }
    }

    // The run-log lookup gets its own error handling: by this point the task
    // lines are already in the report, so letting a failure fall through to
    // the outer catch would add a contradictory "MORNING_REPORT_TASK: unknown"
    // line and blame the schedule for what is a run-log problem.
    try {
      const last = await getPool().query(
        `SELECT run_at, status, error FROM task_run_logs WHERE task_id = $1 ORDER BY run_at DESC LIMIT 1`,
        [MORNING_REPORT_TASK_ID],
      );
      const latest = last.rows[0] as { run_at?: string; status?: string; error?: string } | undefined;
      lines.push(`MORNING_REPORT_LAST_LOG: ${latest ? `${latest.status || 'unknown'} at ${formatSnapshotDate(latest.run_at)}` : 'none'}`);
      if (latest?.status === 'error') {
        issues.push(`${MORNING_REPORT_TASK_ID} latest run failed${latest.error ? `: ${latest.error}` : ''}`);
      }
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      lines.push('MORNING_REPORT_LAST_LOG: unknown');
      issues.push(`unable to inspect morning report run log (${detail})`);
    }
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    lines.push('MORNING_REPORT_TASK: unknown');
    issues.push(`unable to inspect morning report schedule (${detail})`);
  }
  return { lines, issues };
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const health = assessHealth();
|
||||
const splitBrain = await collectSplitBrainStatus();
|
||||
const watchdog = await queryWatchdog();
|
||||
const dnsCheck = await collectDnsIssues();
|
||||
const morningReportCheck = await collectMorningReportIssues();
|
||||
const splitBrainIssues = collectSplitBrainIssues(splitBrain);
|
||||
const splitBrainStatus = deriveSplitBrainReadiness(splitBrain);
|
||||
const tmpMountIssues = collectTmpMountFindings().map(
|
||||
({ source, mountpoint }) =>
|
||||
`mounted filesystem inside tmp/: ${mountpoint} (${source}) — tmp/ must contain ordinary temp files only`,
|
||||
);
|
||||
const issues = [...health.issues, ...splitBrainIssues, ...tmpMountIssues];
|
||||
const issues = [
|
||||
...health.issues,
|
||||
...splitBrainIssues,
|
||||
...tmpMountIssues,
|
||||
...dnsCheck.issues,
|
||||
...morningReportCheck.issues,
|
||||
];
|
||||
const criticalSplitBrain =
|
||||
splitBrain.skillsDb !== 'available' ||
|
||||
splitBrain.memoryDb !== 'available' ||
|
||||
|
|
@ -165,6 +308,12 @@ async function main(): Promise<void> {
|
|||
} else {
|
||||
console.log('WATCHDOG: offline');
|
||||
}
|
||||
for (const line of dnsCheck.lines) {
|
||||
console.log(line);
|
||||
}
|
||||
for (const line of morningReportCheck.lines) {
|
||||
console.log(line);
|
||||
}
|
||||
console.log(`SPLIT_BRAIN: ${splitBrainStatus}`);
|
||||
console.log(`SKILLS_DB: ${splitBrain.skillsDb}`);
|
||||
console.log(`SKILLS_ARTIFACT: ${splitBrain.skillsArtifact}`);
|
||||
|
|
@ -185,7 +334,11 @@ async function main(): Promise<void> {
|
|||
}
|
||||
}
|
||||
|
||||
await closePool();
|
||||
|
||||
if (status === 'error') process.exitCode = 1;
|
||||
}
|
||||
|
||||
await main();
|
||||
// Run the doctor and release the database pool afterwards, whether main()
// resolved or rejected.
// NOTE(review): main() itself also awaits closePool() before returning, so this
// presumably relies on closePool() being safe to call twice — confirm.
await main().finally(() => closePool());
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue