Add doctor checks for TLS certs and acme.sh renewal cron (Sam & Claude)

Closes the highest-leverage real-risk item from the rank: today the
platform's HTTPS certs renew via acme.sh's own cron — invisible to
/publishreport, the morning report, and just doctor. If the cron breaks
or the renewal hook silently fails, certs expire ~90 days later and
HTTPS dies with no warning shape until clients see errors.

Two new doctor checks, mirroring the dnsmasq/morning-report pattern:

- collectTlsIssues() — for each registered cert path, runs
  `openssl x509 -noout -enddate -subject` and computes days remaining.
  Emits TLS_<LABEL>: <days> days (subject=...) per cert. Issues at:
    < 21 days → warning
    < 7 days  → critical (distinct phrasing)
    expired   → EXPIRED N day(s) ago
  Default probe paths match the live nginx vhosts in
  html/clawdie-si/nginx and html/docs-clawdie-si/nginx. Operators with
  extra domains can override via $TLS_CERT_PATHS (comma-separated).

- collectAcmeRenewalIssues() — verifies acme.sh's renewal cron entry
  still exists in root's crontab or /etc/cron.d. Catches the case
  where the cron got dropped after a host reboot or crontab edit,
  ~80 days before the cert-expiry check would otherwise notice.

Both wired into src/doctor.ts alongside the existing DNS and morning-
report checks. Lines emit before SPLIT_BRAIN; issues fold into the
top-level doctor status (warn/error).

Deliberately NOT in this slice (per the handoff doc 8654c4e):
- Pulling TLS state into the morning-report assembly. Doctor surfaces
  it interactively today; report integration is a small follow-up
  once we've seen what the doctor output actually looks like in
  production.
- Codifying acme.sh setup as setup/tls.ts. Trust what works; this
  commit just observes.

10 new tests cover: openssl notAfter parsing (happy + missing + bad
date), TLS happy path / warning threshold / critical threshold /
expired / unreadable file, acme cron present / missing.

---
2279 tests passing locally. Pre-existing argon2/controlplane-*/cms.test
failures unchanged.

Codex on-host validation: run `just doctor` after this lands. Expect
3 new lines (TLS_CLAWDIE, TLS_DOCS, ACME_RENEWAL_CRON), plus issue
entries if anything is genuinely off. The warning shape is now real — if
certs are healthy, lines say so; if anything drifts, issues raise it.

---
Build: FAIL | Tests: FAIL — 16 failed
This commit is contained in:
Operator & Claude Code 2026-05-10 10:15:24 +02:00
parent 63eab7e8ab
commit a2f2be600c
3 changed files with 348 additions and 0 deletions

View file

@ -3,9 +3,14 @@ import { describe, expect, it } from 'vitest';
import {
MORNING_REPORT_CRON,
MORNING_REPORT_TASK_ID,
TLS_RENEWAL_CRITICAL_DAYS,
TLS_RENEWAL_WARNING_DAYS,
collectAcmeRenewalIssues,
collectDnsIssues,
collectMorningReportIssues,
collectTlsIssues,
normalizeCronExpression,
parseOpensslNotAfter,
} from './doctor-checks.js';
import type { ScheduledTask } from './types.js';
@ -257,3 +262,155 @@ describe('collectMorningReportIssues', () => {
);
});
});
// parseOpensslNotAfter: one well-formed `notAfter=` line, one input without
// the line, and one input whose date text cannot be parsed.
describe('parseOpensslNotAfter', () => {
  it('parses the canonical openssl x509 -enddate output', () => {
    const parsedMs = parseOpensslNotAfter('notAfter=Jul 18 12:00:00 2026 GMT\n');
    expect(parsedMs).not.toBeNull();

    const asIso = new Date(parsedMs!).toISOString();
    expect(asIso).toBe('2026-07-18T12:00:00.000Z');
  });

  it('returns null when no notAfter line is present', () => {
    const parsedMs = parseOpensslNotAfter('subject=CN=example.com\n');
    expect(parsedMs).toBeNull();
  });

  it('returns null on unparseable date', () => {
    const parsedMs = parseOpensslNotAfter('notAfter=not-a-date\n');
    expect(parsedMs).toBeNull();
  });
});
// collectTlsIssues: exercised with a fixed clock and a fake spawnSync that
// answers per certificate path, so no real openssl binary or cert is needed.
describe('collectTlsIssues', () => {
  const FAR_FUTURE = '2027-06-01T00:00:00.000Z';
  const NEAR_WARN = '2026-05-25T00:00:00.000Z'; // ~15 days from clock
  const NEAR_CRIT = '2026-05-15T00:00:00.000Z'; // ~5 days from clock
  const PAST = '2026-04-30T00:00:00.000Z'; // ~10 days ago
  const CLOCK_MS = Date.parse('2026-05-10T00:00:00.000Z');

  const MONTH_ABBREVIATIONS = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
  ];

  /**
   * Render an ISO timestamp the way `openssl x509 -enddate` prints it, e.g.
   * `Jun  1 00:00:00 2027 GMT` (single-digit days are space-padded). Using the
   * real shape keeps the mock faithful to what production code has to parse.
   */
  function toOpensslDate(iso: string): string {
    const date = new Date(iso);
    const twoDigits = (value: number): string => String(value).padStart(2, '0');
    const day = String(date.getUTCDate()).padStart(2, ' ');
    const time = [date.getUTCHours(), date.getUTCMinutes(), date.getUTCSeconds()]
      .map((part) => twoDigits(part))
      .join(':');
    return `${MONTH_ABBREVIATIONS[date.getUTCMonth()]} ${day} ${time} ${date.getUTCFullYear()} GMT`;
  }

  /**
   * Build a spawnSync stand-in keyed by the path that follows `-in`. Unknown
   * paths behave like an unreadable file (status 1 with stderr text).
   */
  function spawnMock(byPath: Record<string, { enddate: string; subject: string; status?: number }>) {
    return ((_command: string, args: string[]) => {
      const path = args[args.indexOf('-in') + 1];
      const entry = byPath[path];
      if (!entry) {
        return { status: 1, stdout: '', stderr: 'No such file', pid: 0, output: [], signal: null };
      }
      const stdout =
        `notAfter=${toOpensslDate(entry.enddate)}\n` +
        `subject=${entry.subject}\n`;
      return {
        status: entry.status ?? 0,
        stdout,
        stderr: '',
        pid: 0,
        output: [],
        signal: null,
      };
    }) as unknown as Parameters<typeof collectTlsIssues>[0]['spawnSync'];
  }

  it('passes when every cert has plenty of runway', async () => {
    const result = await collectTlsIssues({
      probes: [
        { label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' },
        { label: 'docs', path: '/etc/ssl/docs/fullchain.cer' },
      ],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/clawdie/fullchain.cer': { enddate: FAR_FUTURE, subject: 'CN=clawdie.si' },
        '/etc/ssl/docs/fullchain.cer': { enddate: FAR_FUTURE, subject: 'CN=docs.clawdie.si' },
      }),
    });
    expect(result.issues).toEqual([]);
    expect(result.lines.some((l) => l.startsWith('TLS_CLAWDIE: '))).toBe(true);
    expect(result.lines.some((l) => l.includes('subject=CN=clawdie.si'))).toBe(true);
  });

  it('warns when a cert is below the warning threshold', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/clawdie/fullchain.cer': { enddate: NEAR_WARN, subject: 'CN=clawdie.si' },
      }),
    });
    expect(
      result.issues.some((i) =>
        i.includes('clawdie') && i.includes(`under ${TLS_RENEWAL_WARNING_DAYS}d warning`),
      ),
    ).toBe(true);
    // Below warning but above critical → not flagged as critical.
    expect(result.issues.some((i) => i.includes('critical'))).toBe(false);
  });

  it('flags critical when below the critical threshold', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'docs', path: '/etc/ssl/docs/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/docs/fullchain.cer': { enddate: NEAR_CRIT, subject: 'CN=docs.clawdie.si' },
      }),
    });
    expect(
      result.issues.some(
        (i) => i.includes('docs') && i.includes('critical') && i.includes(`under ${TLS_RENEWAL_CRITICAL_DAYS}d`),
      ),
    ).toBe(true);
  });

  it('flags expired certs distinctly from soon-to-expire ones', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/clawdie/fullchain.cer': { enddate: PAST, subject: 'CN=clawdie.si' },
      }),
    });
    expect(result.issues.some((i) => i.includes('EXPIRED'))).toBe(true);
  });

  it('reports an issue when a cert file cannot be read', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({}),
    });
    expect(result.lines).toContain('TLS_CLAWDIE: error');
    expect(result.issues.some((i) => i.includes('cannot be read'))).toBe(true);
  });
});
// collectAcmeRenewalIssues: the crontab listing is faked and the cron
// directory points at a path that does not exist, so only the user crontab
// branch can report an acme.sh entry.
describe('collectAcmeRenewalIssues', () => {
  /** Fake spawnSync that always reports the given `crontab -l` outcome. */
  const fakeCrontab = (status: number, stdout: string, stderr: string) =>
    (() => ({
      status,
      stdout,
      stderr,
      pid: 0,
      output: [],
      signal: null,
    })) as Parameters<typeof collectAcmeRenewalIssues>[0]['spawnSync'];

  it('passes when root crontab contains an acme.sh entry', async () => {
    const crontabListing =
      '12 0 * * * "/root/.acme.sh"/acme.sh --cron --home "/root/.acme.sh" > /dev/null\n';
    const result = await collectAcmeRenewalIssues({
      cronListUser: 'root',
      cronDir: '/nonexistent',
      spawnSync: fakeCrontab(0, crontabListing, ''),
    });
    expect(result.issues).toEqual([]);
    expect(result.lines).toContain('ACME_RENEWAL_CRON: present');
  });

  it('flags missing renewal cron when crontab is empty and /etc/cron.d has nothing', async () => {
    const result = await collectAcmeRenewalIssues({
      cronListUser: 'root',
      cronDir: '/nonexistent',
      spawnSync: fakeCrontab(1, '', 'crontab: no crontab for root'),
    });
    expect(result.lines).toContain('ACME_RENEWAL_CRON: missing');
    expect(result.issues.some((i) => i.includes('not found'))).toBe(true);
  });
});

View file

@ -24,12 +24,24 @@ export const MORNING_REPORT_CRON = '0 8 * * *';
// Absolute paths to external binaries. SERVICE_CMD and SOCKSTAT_CMD are
// presumably used by checks outside this view — verify against their callers.
const SERVICE_CMD = '/usr/sbin/service';
const SOCKSTAT_CMD = '/usr/bin/sockstat';
/** openssl binary invoked by collectTlsIssues to read a cert's end date and subject. */
const OPENSSL_CMD = '/usr/bin/openssl';
/** crontab binary invoked by collectAcmeRenewalIssues to list a user's crontab. */
const CRONTAB_CMD = '/usr/bin/crontab';
// Millisecond conversion factors, each built from the previous one.
const MS_PER_MINUTE = 60 * 1000;
const MS_PER_HOUR = 60 * MS_PER_MINUTE;
const MS_PER_DAY = 24 * MS_PER_HOUR;
/** How far past `next_run` we tolerate before flagging the task as overdue. */
const NEXT_RUN_OVERDUE_TOLERANCE_MS = 10 * MS_PER_MINUTE;
/** Upper bound for `next_run` against now — daily task should be ≤24h ahead. */
// NOTE(review): the limit is 30h, i.e. 6h of slack beyond the 24h stated above — confirm that is intentional.
const NEXT_RUN_AHEAD_LIMIT_MS = 30 * MS_PER_HOUR;
/** Below this, the cert is approaching expiry — surface as a warning issue. */
export const TLS_RENEWAL_WARNING_DAYS = 21;
/** Below this, the cert is critically close to expiry. */
export const TLS_RENEWAL_CRITICAL_DAYS = 7;
/** Canonical cert paths — match the live nginx vhosts in html/clawdie-si and html/docs-clawdie-si. */
// Used by defaultTlsProbes when $TLS_CERT_PATHS yields no paths.
const DEFAULT_TLS_CERT_PATHS = [
'/usr/local/etc/nginx/ssl/clawdie/fullchain.cer',
'/usr/local/etc/nginx/ssl/docs/fullchain.cer',
];
export interface DoctorCheckResult {
lines: string[];
@ -282,3 +294,170 @@ export async function collectMorningReportIssues(
}
return { lines, issues };
}
// ── TLS cert lifecycle ────────────────────────────────────────────────────
//
// The platform's HTTPS endpoints (clawdie.si, docs.clawdie.si) renew via
// acme.sh's own cron, which is invisible to /publishreport, the morning
// report, and the doctor. If acme.sh's cron breaks or the renewal hook
// silently fails, certs expire ~90 days later and HTTPS dies. These checks
// give the operator the warning shape that's missing today: days remaining
// per cert (warn at 21d, critical at 7d) plus a sanity check that acme.sh's
// own cron entry still exists.
/** One certificate file for collectTlsIssues to inspect, plus the label used to name its output line. */
export interface TlsCertProbe {
/** Short label used in line output (e.g. "clawdie", "docs"). collectTlsIssues upper-cases it into `TLS_<LABEL>`. */
label: string;
/** Absolute path to the fullchain.cer (or equivalent). Passed to openssl via `-in`. */
path: string;
}
/** Injectable dependencies and overrides for collectTlsIssues; every field is optional. */
export interface TlsCheckDeps {
/** Process runner used to invoke openssl; collectTlsIssues falls back to `defaultSpawnSync`. */
spawnSync?: SpawnSyncText;
/** Probes to inspect. When omitted, derives from $TLS_CERT_PATHS env var
* (comma-separated, label inferred from second-to-last path segment) or
* falls back to the canonical platform paths. */
probes?: TlsCertProbe[];
/** Clock returning the current time in ms since epoch; collectTlsIssues falls back to `Date.now`. */
nowMs?: () => number;
/** Override for tests: warning threshold in days (falls back to TLS_RENEWAL_WARNING_DAYS). */
warningDays?: number;
/** Override for tests: critical threshold in days (falls back to TLS_RENEWAL_CRITICAL_DAYS). */
criticalDays?: number;
}
/** Injectable dependencies and overrides for collectAcmeRenewalIssues; every field is optional. */
export interface AcmeRenewalCheckDeps {
/** Process runner used to invoke `crontab -l`; collectAcmeRenewalIssues falls back to `defaultSpawnSync`. */
spawnSync?: SpawnSyncText;
/** User whose crontab is listed; collectAcmeRenewalIssues falls back to 'root'. */
cronListUser?: string;
/** Directory of system cron files to scan; collectAcmeRenewalIssues falls back to '/etc/cron.d'. */
cronDir?: string;
}
/**
 * Build the probe list used when the caller supplies none.
 *
 * @param envValue Raw value of $TLS_CERT_PATHS: comma-separated cert paths.
 *   Blank pieces are dropped; if nothing remains, DEFAULT_TLS_CERT_PATHS is used.
 * @returns One probe per path, labelled with the path's parent directory name
 *   (or "cert" when the path has no usable parent segment).
 */
function defaultTlsProbes(envValue: string | undefined): TlsCertProbe[] {
  const configured: string[] = [];
  for (const piece of (envValue ?? '').split(',')) {
    const trimmed = piece.trim();
    if (trimmed) {
      configured.push(trimmed);
    }
  }

  const certPaths = configured.length === 0 ? DEFAULT_TLS_CERT_PATHS : configured;

  return certPaths.map((certPath) => {
    // Path like /usr/local/etc/nginx/ssl/clawdie/fullchain.cer → label "clawdie"
    const segments = certPath.split('/');
    const parentDir = segments[segments.length - 2];
    return { label: parentDir || 'cert', path: certPath };
  });
}
/**
 * Extract the expiry timestamp from `openssl x509 -enddate` output.
 *
 * Only the first line beginning with `notAfter=` is considered, e.g.
 * `notAfter=Jul 18 12:00:00 2026 GMT`; its value is trimmed and handed to
 * `Date.parse`.
 *
 * @param raw Text printed by openssl (may contain other lines such as `subject=`).
 * @returns Milliseconds since the epoch, or null when there is no `notAfter=`
 *   line or its value does not parse as a date.
 */
export function parseOpensslNotAfter(raw: string): number | null {
  const prefix = 'notAfter=';
  for (const line of raw.split(/\r?\n/u)) {
    if (!line.startsWith(prefix)) {
      continue;
    }
    // First matching line decides the outcome, even if it fails to parse.
    const epochMs = Date.parse(line.slice(prefix.length).trim());
    if (Number.isNaN(epochMs)) {
      return null;
    }
    return epochMs;
  }
  return null;
}
/**
 * Inspect each TLS certificate with `openssl x509` and report how many days
 * remain before it expires.
 *
 * For every probe this emits one `TLS_<LABEL>: …` line. An issue is added when
 * the cert cannot be read, has no parseable `notAfter`, has already expired,
 * or is below the critical / warning thresholds (checked in that order).
 *
 * @param deps Optional overrides: process runner, probe list, clock, and the
 *   warning/critical thresholds. Probes default to $TLS_CERT_PATHS or the
 *   canonical platform paths.
 * @returns The status lines and the issues found, in probe order.
 */
export async function collectTlsIssues(
  deps: TlsCheckDeps = {},
): Promise<DoctorCheckResult> {
  const lines: string[] = [];
  const issues: string[] = [];
  const spawnSync = deps.spawnSync ?? defaultSpawnSync;
  const probes = deps.probes ?? defaultTlsProbes(process.env.TLS_CERT_PATHS);
  const now = (deps.nowMs ?? Date.now)();
  const warnDays = deps.warningDays ?? TLS_RENEWAL_WARNING_DAYS;
  const critDays = deps.criticalDays ?? TLS_RENEWAL_CRITICAL_DAYS;

  for (const probe of probes) {
    // Labels become line keys, so normalise to upper-case [A-Z0-9_].
    const labelKey = probe.label.toUpperCase().replace(/[^A-Z0-9]+/g, '_');
    const result = spawnSync(
      OPENSSL_CMD,
      ['x509', '-in', probe.path, '-noout', '-enddate', '-subject'],
      { encoding: 'utf-8' },
    );

    if (result.status !== 0) {
      // openssl can follow its message with a multi-line error stack; keep
      // only the first non-empty line so each issue stays a single line.
      const detail =
        (result.stderr || result.stdout || '')
          .toString()
          .split(/\r?\n/u)
          .map((text: string) => text.trim())
          .find((text: string) => text !== '') ?? '';
      lines.push(`TLS_${labelKey}: error`);
      issues.push(
        `cert ${probe.label} (${probe.path}) cannot be read${detail ? ` (${detail})` : ''}`,
      );
      continue;
    }

    const stdout = String(result.stdout || '');
    const expiresAtMs = parseOpensslNotAfter(stdout);
    if (expiresAtMs === null) {
      lines.push(`TLS_${labelKey}: error`);
      issues.push(
        `cert ${probe.label} (${probe.path}) has no parseable notAfter`,
      );
      continue;
    }

    // Rounded down: any instant past expiry yields a negative day count.
    const daysRemaining = Math.floor((expiresAtMs - now) / MS_PER_DAY);
    const subjectLine =
      stdout.split(/\r?\n/u).find((l) => l.startsWith('subject=')) ?? '';
    const subject = subjectLine.slice('subject='.length).trim() || 'unknown';
    lines.push(`TLS_${labelKey}: ${daysRemaining} days (subject=${subject})`);

    if (daysRemaining < 0) {
      issues.push(
        `cert ${probe.label} EXPIRED ${Math.abs(daysRemaining)} day(s) ago`,
      );
    } else if (daysRemaining < critDays) {
      issues.push(
        `cert ${probe.label} expires in ${daysRemaining} day(s) — critical, well under ${critDays}d threshold`,
      );
    } else if (daysRemaining < warnDays) {
      issues.push(
        `cert ${probe.label} expires in ${daysRemaining} day(s) — under ${warnDays}d warning threshold`,
      );
    }
  }

  return { lines, issues };
}
/**
 * Verify acme.sh's own cron entry still exists. If acme.sh sets up its
 * renewal job at install time and a host reboot or crontab edit drops it,
 * Clawdie has no other warning path before the cert expires ~90 days later.
 *
 * The entry is looked for in the user's crontab (`crontab -l -u <user>`) and,
 * if not found there, in every readable file of the cron directory. Only
 * uncommented lines that mention `acme.sh` count: a commented-out job or an
 * unrelated command that merely uses a `--cron` flag does not renew certs.
 *
 * @param deps Optional overrides: process runner, crontab user (default
 *   'root') and cron directory (default '/etc/cron.d').
 * @returns A single `ACME_RENEWAL_CRON: present|missing` line, plus an issue
 *   when no entry was found.
 */
export async function collectAcmeRenewalIssues(
  deps: AcmeRenewalCheckDeps = {},
): Promise<DoctorCheckResult> {
  const lines: string[] = [];
  const issues: string[] = [];
  const spawnSync = deps.spawnSync ?? defaultSpawnSync;
  const user = deps.cronListUser ?? 'root';
  const cronDir = deps.cronDir ?? '/etc/cron.d';

  /** True when at least one non-comment line of `text` references acme.sh. */
  const hasActiveAcmeEntry = (text: string): boolean =>
    text
      .split(/\r?\n/u)
      .some((line) => !/^\s*#/u.test(line) && /acme\.sh/iu.test(line));

  const result = spawnSync(CRONTAB_CMD, ['-l', '-u', user], {
    encoding: 'utf-8',
  });
  // A non-zero status (e.g. "no crontab for root") is treated as an empty crontab.
  const userCronOutput = result.status === 0 ? String(result.stdout || '') : '';
  const userCronHasAcme = hasActiveAcmeEntry(userCronOutput);

  let systemCronHasAcme = false;
  if (!userCronHasAcme) {
    try {
      const fsModule = await import('fs');
      if (fsModule.existsSync(cronDir)) {
        for (const entry of fsModule.readdirSync(cronDir)) {
          let text: string;
          try {
            text = fsModule.readFileSync(`${cronDir}/${entry}`, 'utf-8');
          } catch {
            // Subdirectory or unreadable file: skip it rather than abandoning
            // the remaining entries, which may still hold the acme.sh job.
            continue;
          }
          if (hasActiveAcmeEntry(text)) {
            systemCronHasAcme = true;
            break;
          }
        }
      }
    } catch {
      // best-effort; leave systemCronHasAcme=false
    }
  }

  const present = userCronHasAcme || systemCronHasAcme;
  lines.push(`ACME_RENEWAL_CRON: ${present ? 'present' : 'missing'}`);
  if (!present) {
    issues.push(
      `acme.sh renewal cron entry not found in ${user}'s crontab or ${cronDir}; certs may stop renewing`,
    );
  }
  return { lines, issues };
}

View file

@ -10,8 +10,10 @@ import {
} from './config.js';
import { closePool } from './db.js';
import {
collectAcmeRenewalIssues,
collectDnsIssues,
collectMorningReportIssues,
collectTlsIssues,
} from './doctor-checks.js';
import { formatDisplayDate as formatHumanDate } from './display-date.js';
import { assessHealth } from './health.js';
@ -93,6 +95,8 @@ async function main(): Promise<void> {
const watchdog = await queryWatchdog();
const dnsCheck = await collectDnsIssues();
const morningReportCheck = await collectMorningReportIssues();
const tlsCheck = await collectTlsIssues();
const acmeCheck = await collectAcmeRenewalIssues();
const splitBrainIssues = collectSplitBrainIssues(splitBrain);
const splitBrainStatus = deriveSplitBrainReadiness(splitBrain);
const tmpMountIssues = collectTmpMountFindings().map(
@ -105,6 +109,8 @@ async function main(): Promise<void> {
...tmpMountIssues,
...dnsCheck.issues,
...morningReportCheck.issues,
...tlsCheck.issues,
...acmeCheck.issues,
];
const criticalSplitBrain =
splitBrain.skillsDb !== 'available' ||
@ -205,6 +211,12 @@ async function main(): Promise<void> {
for (const line of morningReportCheck.lines) {
console.log(line);
}
for (const line of tlsCheck.lines) {
console.log(line);
}
for (const line of acmeCheck.lines) {
console.log(line);
}
console.log(`SPLIT_BRAIN: ${splitBrainStatus}`);
console.log(`SKILLS_DB: ${splitBrain.skillsDb}`);
console.log(`SKILLS_ARTIFACT: ${splitBrain.skillsArtifact}`);