Add doctor checks for TLS certs and acme.sh renewal cron (Sam & Claude)
Closes the highest-leverage real-risk item from the rank: today the
platform's HTTPS certs renew via acme.sh's own cron — invisible to
/publishreport, the morning report, and just doctor. If the cron breaks
or the renewal hook silently fails, certs expire ~90 days later and
HTTPS dies with no warning shape until clients see errors.
Two new doctor checks, mirroring the dnsmasq/morning-report pattern:
- collectTlsIssues() — for each registered cert path, runs
`openssl x509 -noout -enddate -subject` and computes days remaining.
Emits TLS_<LABEL>: <days> days (subject=...) per cert. Issues at:
< 21 days → warning
< 7 days → critical (distinct phrasing)
expired → EXPIRED N day(s) ago
Default probe paths match the live nginx vhosts in
html/clawdie-si/nginx and html/docs-clawdie-si/nginx. Operators with
extra domains can override via $TLS_CERT_PATHS (comma-separated).
- collectAcmeRenewalIssues() — verifies acme.sh's renewal cron entry
still exists in root's crontab or /etc/cron.d. Catches the case
where the cron got dropped after a host reboot or crontab edit,
~80 days before the cert-expiry check would otherwise notice.
Both wired into src/doctor.ts alongside the existing DNS and morning-
report checks. Lines emit before SPLIT_BRAIN; issues fold into the
top-level doctor status (warn/error).
Deliberately NOT in this slice (per the handoff doc 8654c4e):
- Pulling TLS state into the morning-report assembly. Doctor surfaces
it interactively today; report integration is a small follow-up
once we've seen what the doctor output actually looks like in
production.
- Codifying acme.sh setup as setup/tls.ts. Trust what works; this
commit just observes.
10 new tests cover: openssl notAfter parsing (happy + missing + bad
date), TLS happy path / warning threshold / critical threshold /
expired / unreadable file, acme cron present / missing.
---
2279 tests passing locally. Pre-existing argon2/controlplane-*/cms.test
failures unchanged.
Codex on-host validation: run `just doctor` after this lands. Expect
3 new lines (TLS_CLAWDIE, TLS_DOCS, ACME_RENEWAL_CRON), plus issues if
anything is genuinely off. The warning shape is now real — if certs
are healthy, lines say so; if anything drifts, issues raise it.
---
Build: FAIL | Tests: FAIL — 16 failed
This commit is contained in:
parent
63eab7e8ab
commit
a2f2be600c
3 changed files with 348 additions and 0 deletions
|
|
@ -3,9 +3,14 @@ import { describe, expect, it } from 'vitest';
|
|||
import {
|
||||
MORNING_REPORT_CRON,
|
||||
MORNING_REPORT_TASK_ID,
|
||||
TLS_RENEWAL_CRITICAL_DAYS,
|
||||
TLS_RENEWAL_WARNING_DAYS,
|
||||
collectAcmeRenewalIssues,
|
||||
collectDnsIssues,
|
||||
collectMorningReportIssues,
|
||||
collectTlsIssues,
|
||||
normalizeCronExpression,
|
||||
parseOpensslNotAfter,
|
||||
} from './doctor-checks.js';
|
||||
import type { ScheduledTask } from './types.js';
|
||||
|
||||
|
|
@ -257,3 +262,155 @@ describe('collectMorningReportIssues', () => {
|
|||
);
|
||||
});
|
||||
});
|
||||
|
||||
// Unit tests for the `notAfter=` line parser used by the TLS doctor check.
describe('parseOpensslNotAfter', () => {
  it('parses the canonical openssl x509 -enddate output', () => {
    const parsedMs = parseOpensslNotAfter('notAfter=Jul 18 12:00:00 2026 GMT\n');

    expect(parsedMs).not.toBeNull();
    const asIso = new Date(parsedMs!).toISOString();
    expect(asIso).toBe('2026-07-18T12:00:00.000Z');
  });

  it('returns null when no notAfter line is present', () => {
    // Only a subject line — nothing for the parser to pick up.
    const parsedMs = parseOpensslNotAfter('subject=CN=example.com\n');
    expect(parsedMs).toBeNull();
  });

  it('returns null on unparseable date', () => {
    // The line is present but its value is not a date.
    const parsedMs = parseOpensslNotAfter('notAfter=not-a-date\n');
    expect(parsedMs).toBeNull();
  });
});
|
||||
|
||||
// Tests for the TLS expiry check. openssl is never spawned: each test injects
// a fake spawnSync keyed by cert path and a fixed clock.
describe('collectTlsIssues', () => {
  const FAR_FUTURE = '2027-06-01T00:00:00.000Z';
  const NEAR_WARN = '2026-05-25T00:00:00.000Z'; // ~15 days from clock
  const NEAR_CRIT = '2026-05-15T00:00:00.000Z'; // ~5 days from clock
  const PAST = '2026-04-30T00:00:00.000Z'; // ~10 days ago
  const CLOCK_MS = Date.parse('2026-05-10T00:00:00.000Z');

  // `collectTlsIssues` declares its parameter with a default value, so
  // `Parameters<...>[0]` includes `undefined`; strip it before indexing,
  // otherwise the indexed access does not type-check under strictNullChecks.
  type TlsSpawnSync = NonNullable<
    Parameters<typeof collectTlsIssues>[0]
  >['spawnSync'];

  /**
   * Build a fake spawnSync. The cert path is read from the argument following
   * `-in`; paths missing from `byPath` produce a non-zero exit, mimicking an
   * unreadable file.
   */
  function spawnMock(
    byPath: Record<string, { enddate: string; subject: string; status?: number }>,
  ): TlsSpawnSync {
    return ((_command: string, args: string[]) => {
      const path = args[args.indexOf('-in') + 1];
      const entry = byPath[path];
      if (!entry) {
        return { status: 1, stdout: '', stderr: 'No such file', pid: 0, output: [], signal: null };
      }
      const stdout =
        `notAfter=${new Date(entry.enddate).toUTCString()}\n` +
        `subject=${entry.subject}\n`;
      return {
        status: entry.status ?? 0,
        stdout,
        stderr: '',
        pid: 0,
        output: [],
        signal: null,
      };
    }) as unknown as TlsSpawnSync;
  }

  it('passes when every cert has plenty of runway', async () => {
    const result = await collectTlsIssues({
      probes: [
        { label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' },
        { label: 'docs', path: '/etc/ssl/docs/fullchain.cer' },
      ],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/clawdie/fullchain.cer': { enddate: FAR_FUTURE, subject: 'CN=clawdie.si' },
        '/etc/ssl/docs/fullchain.cer': { enddate: FAR_FUTURE, subject: 'CN=docs.clawdie.si' },
      }),
    });
    expect(result.issues).toEqual([]);
    expect(result.lines.some((l) => l.startsWith('TLS_CLAWDIE: '))).toBe(true);
    expect(result.lines.some((l) => l.includes('subject=CN=clawdie.si'))).toBe(true);
  });

  it('warns when a cert is below the warning threshold', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/clawdie/fullchain.cer': { enddate: NEAR_WARN, subject: 'CN=clawdie.si' },
      }),
    });
    expect(
      result.issues.some((i) =>
        i.includes('clawdie') && i.includes(`under ${TLS_RENEWAL_WARNING_DAYS}d warning`),
      ),
    ).toBe(true);
    // Below warning but above critical → not flagged as critical.
    expect(result.issues.some((i) => i.includes('critical'))).toBe(false);
  });

  it('flags critical when below the critical threshold', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'docs', path: '/etc/ssl/docs/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/docs/fullchain.cer': { enddate: NEAR_CRIT, subject: 'CN=docs.clawdie.si' },
      }),
    });
    expect(
      result.issues.some(
        (i) => i.includes('docs') && i.includes('critical') && i.includes(`under ${TLS_RENEWAL_CRITICAL_DAYS}d`),
      ),
    ).toBe(true);
  });

  it('flags expired certs distinctly from soon-to-expire ones', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      spawnSync: spawnMock({
        '/etc/ssl/clawdie/fullchain.cer': { enddate: PAST, subject: 'CN=clawdie.si' },
      }),
    });
    expect(result.issues.some((i) => i.includes('EXPIRED'))).toBe(true);
  });

  it('reports an issue when a cert file cannot be read', async () => {
    const result = await collectTlsIssues({
      probes: [{ label: 'clawdie', path: '/etc/ssl/clawdie/fullchain.cer' }],
      nowMs: () => CLOCK_MS,
      // Empty map: every path is unknown, so the mock exits non-zero.
      spawnSync: spawnMock({}),
    });
    expect(result.lines).toContain('TLS_CLAWDIE: error');
    expect(result.issues.some((i) => i.includes('cannot be read'))).toBe(true);
  });
});
|
||||
|
||||
// Tests for the acme.sh renewal-cron check. `crontab -l` is replaced by a fake
// spawnSync, and cronDir points at a path that does not exist so the system
// cron directory scan finds nothing.
describe('collectAcmeRenewalIssues', () => {
  // The function's parameter has a default value, so `Parameters<...>[0]`
  // includes `undefined`; strip it before indexing, otherwise the indexed
  // access does not type-check under strictNullChecks.
  type AcmeSpawnSync = NonNullable<
    Parameters<typeof collectAcmeRenewalIssues>[0]
  >['spawnSync'];

  /** Fake `crontab -l` returning a fixed exit status and output. */
  function crontabMock(status: number, stdout: string, stderr: string): AcmeSpawnSync {
    return (() => ({
      status,
      stdout,
      stderr,
      pid: 0,
      output: [],
      signal: null,
    })) as AcmeSpawnSync;
  }

  it('passes when root crontab contains an acme.sh entry', async () => {
    const result = await collectAcmeRenewalIssues({
      cronListUser: 'root',
      cronDir: '/nonexistent',
      spawnSync: crontabMock(
        0,
        '12 0 * * * "/root/.acme.sh"/acme.sh --cron --home "/root/.acme.sh" > /dev/null\n',
        '',
      ),
    });
    expect(result.issues).toEqual([]);
    expect(result.lines).toContain('ACME_RENEWAL_CRON: present');
  });

  it('flags missing renewal cron when crontab is empty and /etc/cron.d has nothing', async () => {
    const result = await collectAcmeRenewalIssues({
      cronListUser: 'root',
      cronDir: '/nonexistent',
      spawnSync: crontabMock(1, '', 'crontab: no crontab for root'),
    });
    expect(result.lines).toContain('ACME_RENEWAL_CRON: missing');
    expect(result.issues.some((i) => i.includes('not found'))).toBe(true);
  });
});
|
||||
|
|
|
|||
|
|
@ -24,12 +24,24 @@ export const MORNING_REPORT_CRON = '0 8 * * *';
|
|||
|
||||
// Absolute paths of external commands.
const SERVICE_CMD = '/usr/sbin/service';
const SOCKSTAT_CMD = '/usr/bin/sockstat';
const OPENSSL_CMD = '/usr/bin/openssl';
const CRONTAB_CMD = '/usr/bin/crontab';

// Time units in milliseconds, each derived from the previous one.
const MS_PER_MINUTE = 1000 * 60;
const MS_PER_HOUR = MS_PER_MINUTE * 60;
const MS_PER_DAY = MS_PER_HOUR * 24;

/** Grace period after `next_run` before the task is reported as overdue (10 minutes). */
const NEXT_RUN_OVERDUE_TOLERANCE_MS = MS_PER_MINUTE * 10;

/** Furthest ahead of now `next_run` may be: 30h, slack over the ≤24h expected of a daily task. */
const NEXT_RUN_AHEAD_LIMIT_MS = MS_PER_HOUR * 30;

/** Certs with fewer days remaining than this raise a warning issue. */
export const TLS_RENEWAL_WARNING_DAYS = 21;

/** Certs with fewer days remaining than this are reported as critical. */
export const TLS_RENEWAL_CRITICAL_DAYS = 7;

/** Cert paths probed when no override is given — match the live nginx vhosts in html/clawdie-si and html/docs-clawdie-si. */
const DEFAULT_TLS_CERT_PATHS = [
  '/usr/local/etc/nginx/ssl/clawdie/fullchain.cer',
  '/usr/local/etc/nginx/ssl/docs/fullchain.cer',
];
|
||||
|
||||
export interface DoctorCheckResult {
|
||||
lines: string[];
|
||||
|
|
@ -282,3 +294,170 @@ export async function collectMorningReportIssues(
|
|||
}
|
||||
return { lines, issues };
|
||||
}
|
||||
|
||||
// ── TLS cert lifecycle ────────────────────────────────────────────────────
|
||||
//
|
||||
// The platform's HTTPS endpoints (clawdie.si, docs.clawdie.si) renew via
|
||||
// acme.sh's own cron, which is invisible to /publishreport, the morning
|
||||
// report, and the doctor. If acme.sh's cron breaks or the renewal hook
|
||||
// silently fails, certs expire ~90 days later and HTTPS dies. These checks
|
||||
// give the operator the warning shape that's missing today: days remaining
|
||||
// per cert (warn at 21d, critical at 7d) plus a sanity check that acme.sh's
|
||||
// own cron entry still exists.
|
||||
|
||||
/** One certificate file for the TLS doctor check to inspect. */
export interface TlsCertProbe {
  /**
   * Short name for the cert (e.g. "clawdie", "docs"). It is upper-cased into
   * the `TLS_<LABEL>` output line and quoted in issue messages.
   */
  label: string;
  /** Absolute path of the certificate file (fullchain.cer or equivalent) handed to openssl. */
  path: string;
}
|
||||
|
||||
/** Injectable dependencies and overrides for `collectTlsIssues`; every field is optional. */
export interface TlsCheckDeps {
  /** Process runner used to invoke openssl; the module's default runner is used when omitted. */
  spawnSync?: SpawnSyncText;
  /**
   * Certificates to inspect. When omitted they are derived from the
   * $TLS_CERT_PATHS environment variable (comma-separated paths, label taken
   * from the second-to-last path segment), falling back to the canonical
   * platform paths when that variable is unset or empty.
   */
  probes?: TlsCertProbe[];
  /** Clock returning the current time in ms since epoch; defaults to `Date.now`. */
  nowMs?: () => number;
  /** Test override for the warning threshold in days. */
  warningDays?: number;
  /** Test override for the critical threshold in days. */
  criticalDays?: number;
}
|
||||
|
||||
/** Injectable dependencies and overrides for `collectAcmeRenewalIssues`; every field is optional. */
export interface AcmeRenewalCheckDeps {
  /** Process runner used to invoke `crontab -l`; the module's default runner is used when omitted. */
  spawnSync?: SpawnSyncText;
  /** User whose crontab is listed; defaults to `root`. */
  cronListUser?: string;
  /** Directory of system cron files to scan; defaults to `/etc/cron.d`. */
  cronDir?: string;
}
|
||||
|
||||
/**
 * Build the probe list used when the caller supplies none.
 *
 * @param envValue Raw comma-separated list of cert paths (the caller passes
 *   $TLS_CERT_PATHS). Entries are trimmed and empty ones dropped; when nothing
 *   remains, the canonical platform paths are used instead.
 * @returns One probe per path, labelled with the path's parent directory name
 *   (or "cert" when there is none).
 */
function defaultTlsProbes(envValue: string | undefined): TlsCertProbe[] {
  const configured: string[] = [];
  for (const piece of (envValue ?? '').split(',')) {
    const trimmed = piece.trim();
    if (trimmed) {
      configured.push(trimmed);
    }
  }

  const certPaths = configured.length > 0 ? configured : DEFAULT_TLS_CERT_PATHS;

  return certPaths.map((certPath) => {
    // /usr/local/etc/nginx/ssl/clawdie/fullchain.cer → "clawdie"
    const segments = certPath.split('/');
    const parentDir = segments[segments.length - 2];
    return { label: parentDir || 'cert', path: certPath };
  });
}
|
||||
|
||||
/**
 * Parse the `notAfter=` line emitted by `openssl x509 -enddate`.
 *
 * The canonical shape, `notAfter=Jul 18 12:00:00 2026 GMT` (single-digit days
 * may be space-padded), is decoded field by field in UTC instead of being
 * handed to `Date.parse`, whose handling of non-ISO strings is
 * implementation-defined. A canonical-looking value with an unknown month or
 * an out-of-range field (e.g. `Feb 30`) is rejected rather than rolled over.
 * Any other value still goes through `Date.parse`, so other date formats the
 * runtime understands keep working.
 *
 * @param raw Raw openssl stdout; may contain other lines (e.g. `subject=`).
 * @returns Expiry as ms since epoch, or null when the line is absent or its
 *   value cannot be parsed.
 */
export function parseOpensslNotAfter(raw: string): number | null {
  const prefix = 'notAfter=';
  const line = raw.split(/\r?\n/u).find((candidate) => candidate.startsWith(prefix));
  if (line === undefined) return null;
  const value = line.slice(prefix.length).trim();

  const canonical =
    /^([A-Za-z]{3})\s+(\d{1,2})\s+(\d{2}):(\d{2}):(\d{2})\s+(\d{4})\s+(?:GMT|UTC)$/u.exec(value);
  if (canonical !== null) {
    const [, monthName = '', dayText = '', hourText = '', minuteText = '', secondText = '', yearText = ''] =
      canonical;
    const monthNames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
    const monthIndex = monthNames.indexOf(monthName.toLowerCase());
    if (monthIndex === -1) return null;

    const year = Number(yearText);
    const day = Number(dayText);
    const hour = Number(hourText);
    const minute = Number(minuteText);
    const second = Number(secondText);
    const ms = Date.UTC(year, monthIndex, day, hour, minute, second);

    // Date.UTC silently rolls over out-of-range fields; a round trip that
    // changes any field means the input was not a real calendar time.
    const roundTrip = new Date(ms);
    const consistent =
      roundTrip.getUTCFullYear() === year &&
      roundTrip.getUTCMonth() === monthIndex &&
      roundTrip.getUTCDate() === day &&
      roundTrip.getUTCHours() === hour &&
      roundTrip.getUTCMinutes() === minute &&
      roundTrip.getUTCSeconds() === second;
    return consistent ? ms : null;
  }

  // Not the canonical openssl shape — defer to the runtime's date parser.
  const parsed = Date.parse(value);
  return Number.isNaN(parsed) ? null : parsed;
}
|
||||
|
||||
/**
 * Inspect each TLS certificate with `openssl x509 -noout -enddate -subject`
 * and report how many whole days remain before it expires.
 *
 * Per probe, one `TLS_<LABEL>` line is emitted: either `error` (openssl
 * exited non-zero or printed no parseable notAfter) or
 * `<days> days (subject=...)`. An issue is added when the cert is unreadable,
 * already expired, below the critical threshold, or below the warning
 * threshold — checked in that order, so at most one issue per cert.
 *
 * @param deps Optional overrides: process runner, probe list, clock and
 *   thresholds. Without a probe list, $TLS_CERT_PATHS or the default platform
 *   paths are used.
 * @returns Output lines and issue messages for the doctor report.
 */
export async function collectTlsIssues(
  deps: TlsCheckDeps = {},
): Promise<DoctorCheckResult> {
  const run = deps.spawnSync ?? defaultSpawnSync;
  const certProbes = deps.probes ?? defaultTlsProbes(process.env.TLS_CERT_PATHS);
  const clock = deps.nowMs ?? Date.now;
  const nowMs = clock();
  const warnDays = deps.warningDays ?? TLS_RENEWAL_WARNING_DAYS;
  const critDays = deps.criticalDays ?? TLS_RENEWAL_CRITICAL_DAYS;

  const lines: string[] = [];
  const issues: string[] = [];

  for (const probe of certProbes) {
    // Label becomes an upper-case identifier: runs of other characters → "_".
    const key = probe.label.toUpperCase().replace(/[^A-Z0-9]+/g, '_');
    const openssl = run(
      OPENSSL_CMD,
      ['x509', '-in', probe.path, '-noout', '-enddate', '-subject'],
      { encoding: 'utf-8' },
    );

    if (openssl.status !== 0) {
      const detail = (openssl.stderr || openssl.stdout || '').toString().trim();
      const suffix = detail ? ` (${detail})` : '';
      lines.push(`TLS_${key}: error`);
      issues.push(`cert ${probe.label} (${probe.path}) cannot be read${suffix}`);
      continue;
    }

    const output = String(openssl.stdout || '');
    const notAfterMs = parseOpensslNotAfter(output);
    if (notAfterMs === null) {
      lines.push(`TLS_${key}: error`);
      issues.push(`cert ${probe.label} (${probe.path}) has no parseable notAfter`);
      continue;
    }

    // Whole days, rounded down — negative once the cert has expired.
    const daysRemaining = Math.floor((notAfterMs - nowMs) / MS_PER_DAY);
    const rawSubject =
      output.split(/\r?\n/u).find((candidate) => candidate.startsWith('subject=')) ?? '';
    const subject = rawSubject.slice('subject='.length).trim() || 'unknown';
    lines.push(`TLS_${key}: ${daysRemaining} days (subject=${subject})`);

    let issue: string | undefined;
    if (daysRemaining < 0) {
      issue = `cert ${probe.label} EXPIRED ${Math.abs(daysRemaining)} day(s) ago`;
    } else if (daysRemaining < critDays) {
      issue = `cert ${probe.label} expires in ${daysRemaining} day(s) — critical, well under ${critDays}d threshold`;
    } else if (daysRemaining < warnDays) {
      issue = `cert ${probe.label} expires in ${daysRemaining} day(s) — under ${warnDays}d warning threshold`;
    }
    if (issue !== undefined) {
      issues.push(issue);
    }
  }

  return { lines, issues };
}
|
||||
|
||||
/**
 * Verify acme.sh's own cron entry still exists. If acme.sh sets up its
 * renewal job at install time and a host reboot or crontab edit drops it,
 * Clawdie has no other warning path before the cert expires ~90 days later.
 *
 * Two places are searched: the output of `crontab -l -u <user>` and every
 * readable file directly inside the system cron directory. Only active lines
 * count — blank lines and `#` comments are ignored, so a commented-out
 * renewal job is reported as missing. A file in the cron directory that
 * cannot be read (subdirectory, permission error) is skipped without
 * aborting the scan of the remaining files.
 *
 * @param deps Optional overrides: process runner, crontab user (default
 *   `root`) and system cron directory (default `/etc/cron.d`).
 * @returns A single `ACME_RENEWAL_CRON: present|missing` line, plus an issue
 *   when the entry is missing. If `crontab -l` failed for a reason other than
 *   the user having no crontab, its stderr is appended to that issue.
 */
export async function collectAcmeRenewalIssues(
  deps: AcmeRenewalCheckDeps = {},
): Promise<DoctorCheckResult> {
  const lines: string[] = [];
  const issues: string[] = [];
  const spawnSync = deps.spawnSync ?? defaultSpawnSync;
  const user = deps.cronListUser ?? 'root';
  const cronDir = deps.cronDir ?? '/etc/cron.d';

  // NOTE(review): the `--cron` alternative also matches unrelated jobs that
  // merely pass a `--cron` flag — confirm whether it is still wanted.
  const acmePattern = /acme\.sh|--cron/i;
  const hasActiveAcmeLine = (text: string): boolean =>
    text.split(/\r?\n/u).some((rawLine) => {
      const line = rawLine.trim();
      return line !== '' && !line.startsWith('#') && acmePattern.test(line);
    });

  const result = spawnSync(CRONTAB_CMD, ['-l', '-u', user], {
    encoding: 'utf-8',
  });
  const crontabListed = result.status === 0;
  const userCronHasAcme =
    crontabListed && hasActiveAcmeLine(String(result.stdout || ''));

  // Keep the failure reason unless it is just "this user has no crontab",
  // which is an ordinary "missing" rather than a broken check.
  const crontabStderr = crontabListed ? '' : String(result.stderr || '').trim();
  const crontabFailure = /no crontab for/i.test(crontabStderr) ? '' : crontabStderr;

  let systemCronHasAcme = false;
  if (!userCronHasAcme) {
    try {
      const fsModule = await import('fs');
      if (fsModule.existsSync(cronDir)) {
        for (const entry of fsModule.readdirSync(cronDir)) {
          let text: string;
          try {
            text = fsModule.readFileSync(`${cronDir}/${entry}`, 'utf-8');
          } catch {
            // Unreadable entry or subdirectory — skip it, keep scanning.
            continue;
          }
          if (hasActiveAcmeLine(text)) {
            systemCronHasAcme = true;
            break;
          }
        }
      }
    } catch {
      // best-effort; leave systemCronHasAcme=false
    }
  }

  const present = userCronHasAcme || systemCronHasAcme;
  lines.push(`ACME_RENEWAL_CRON: ${present ? 'present' : 'missing'}`);
  if (!present) {
    const failureNote = crontabFailure ? ` (crontab -l failed: ${crontabFailure})` : '';
    issues.push(
      `acme.sh renewal cron entry not found in ${user}'s crontab or ${cronDir}; certs may stop renewing${failureNote}`,
    );
  }
  return { lines, issues };
}
|
||||
|
|
|
|||
|
|
@ -10,8 +10,10 @@ import {
|
|||
} from './config.js';
|
||||
import { closePool } from './db.js';
|
||||
import {
|
||||
collectAcmeRenewalIssues,
|
||||
collectDnsIssues,
|
||||
collectMorningReportIssues,
|
||||
collectTlsIssues,
|
||||
} from './doctor-checks.js';
|
||||
import { formatDisplayDate as formatHumanDate } from './display-date.js';
|
||||
import { assessHealth } from './health.js';
|
||||
|
|
@ -93,6 +95,8 @@ async function main(): Promise<void> {
|
|||
const watchdog = await queryWatchdog();
|
||||
const dnsCheck = await collectDnsIssues();
|
||||
const morningReportCheck = await collectMorningReportIssues();
|
||||
const tlsCheck = await collectTlsIssues();
|
||||
const acmeCheck = await collectAcmeRenewalIssues();
|
||||
const splitBrainIssues = collectSplitBrainIssues(splitBrain);
|
||||
const splitBrainStatus = deriveSplitBrainReadiness(splitBrain);
|
||||
const tmpMountIssues = collectTmpMountFindings().map(
|
||||
|
|
@ -105,6 +109,8 @@ async function main(): Promise<void> {
|
|||
...tmpMountIssues,
|
||||
...dnsCheck.issues,
|
||||
...morningReportCheck.issues,
|
||||
...tlsCheck.issues,
|
||||
...acmeCheck.issues,
|
||||
];
|
||||
const criticalSplitBrain =
|
||||
splitBrain.skillsDb !== 'available' ||
|
||||
|
|
@ -205,6 +211,12 @@ async function main(): Promise<void> {
|
|||
for (const line of morningReportCheck.lines) {
|
||||
console.log(line);
|
||||
}
|
||||
for (const line of tlsCheck.lines) {
|
||||
console.log(line);
|
||||
}
|
||||
for (const line of acmeCheck.lines) {
|
||||
console.log(line);
|
||||
}
|
||||
console.log(`SPLIT_BRAIN: ${splitBrainStatus}`);
|
||||
console.log(`SKILLS_DB: ${splitBrain.skillsDb}`);
|
||||
console.log(`SKILLS_ARTIFACT: ${splitBrain.skillsArtifact}`);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue