Harden host DB reboot path

---
Build: pass | Tests: FAIL — Tests  3 failed | 2079 passed (2082)
This commit is contained in:
Operator & Codex 2026-05-01 10:22:54 +02:00
parent d456aa4be1
commit b02746c298
7 changed files with 231 additions and 13 deletions

View file

@ -10,6 +10,7 @@ import fs from 'fs';
import path from 'path';
import {
DB_RUNTIME,
PLATFORM_RUNTIME_HOME,
PLATFORM_RUNTIME_USER,
PLATFORM_SERVICE_NAME,
@ -180,11 +181,13 @@ function generateRcdService(
logPath: string,
): string {
const envFile = path.join(projectRoot, '.env');
const requireTargets =
DB_RUNTIME === 'host' ? 'NETWORKING LOGIN postgresql' : 'NETWORKING LOGIN';
return [
'#!/bin/sh',
'#',
`# PROVIDE: ${runtime.serviceName}`,
'# REQUIRE: NETWORKING LOGIN',
`# REQUIRE: ${requireTargets}`,
'# KEYWORD: shutdown',
'',
'. /etc/rc.subr',
@ -199,7 +202,7 @@ function generateRcdService(
'load_rc_config $name',
`: \$\{${runtime.serviceName}_enable:="NONE"\}`,
'',
`case "\$\{${agentName}_enable\}" in`,
`case "\$\{${runtime.serviceName}_enable\}" in`,
' [Nn][Oo][Nn][Ee])',
' if [ "$1" = "start" ] || [ "$1" = "restart" ]; then',
' echo ""',

View file

@ -100,14 +100,13 @@ async function checkDbJail(hostdAvailable: boolean): Promise<ControlPlaneCheckRe
}
if (DB_RUNTIME === 'host') {
logger.info('controlplane: host postgres unreachable — attempting service start via hostd');
const res = await hostd('service-start', { name: 'postgresql' });
return {
id, label,
status: res.ok ? 'ok' : 'fail',
detail: res.ok ? 'started via hostd' : (res.error ?? res.output),
fixAttempted: true,
fixResult: res.ok ? 'success' : 'failed',
id,
label,
status: 'fail',
detail:
`PostgreSQL unreachable at ${DB_HOST}:5432; host runtime requires host-level service ordering/recovery`,
fixAttempted: false,
};
}

View file

@ -138,5 +138,25 @@ describe('authorizeHostdOperation', () => {
owner: 'shared-platform',
reason: 'platform-level operation requires operator approval',
});
expect(
authorizeHostdOperation(
'maintenance-reboot',
{},
{ tenantId: 'mevy', caller: 'operator', registry },
),
).toEqual({ allowed: true, owner: 'shared-platform' });
expect(
authorizeHostdOperation(
'maintenance-reboot',
{},
{ tenantId: 'mevy', caller: 'tenant-agent', registry },
),
).toEqual({
allowed: false,
owner: 'shared-platform',
reason: 'platform-level operation requires operator approval',
});
});
});

View file

@ -165,6 +165,7 @@ export function authorizeHostdOperation(
case 'pkg-fetch':
case 'pkg-cache-init':
case 'shutdown-reboot':
case 'maintenance-reboot':
case 'sysrc-set':
case 'sanoid-snapshot':
return caller === 'operator'

View file

@ -38,7 +38,7 @@ describe('handleOp — dispatch', () => {
'zfs-list-snapshot-usage', 'zfs-create', 'zfs-rollback',
'zpool-list', 'zpool-status', 'pf-reload', 'pf-enable',
'service-start', 'service-stop', 'service-restart', 'service-status',
'pkg-install', 'pkg-version', 'pkg-upgrade', 'shutdown-reboot', 'pkg-fetch', 'pkg-cache-init',
'pkg-install', 'pkg-version', 'pkg-upgrade', 'shutdown-reboot', 'maintenance-reboot', 'pkg-fetch', 'pkg-cache-init',
'sysrc-set', 'sanoid-snapshot',
]) {
expect(result.output).toContain(name);
@ -462,3 +462,82 @@ describe('handleOp — shutdown-reboot', () => {
expect(mockSpawnSync).not.toHaveBeenCalled();
});
});
// Verifies the ordered sequence of commands that the `maintenance-reboot` op issues
// through the mocked spawnSync.
describe('handleOp — maintenance-reboot', () => {
  it('stops mevy, checkpoints postgres, snapshots both datasets, and schedules reboot', () => {
    // Queue one successful spawn result per expected command, in call order:
    //   1 service mevy onestatus        2 service mevy stop
    //   3 service postgresql onestatus  4 su postgres psql CHECKPOINT
    //   5 service postgresql onestatus  6 service postgresql stop
    //   7 zfs snapshot data             8 zfs snapshot wal
    //   9 shutdown
    for (let queued = 0; queued < 9; queued += 1) {
      mockSpawnSync.mockReturnValueOnce(spawnOk());
    }

    const outcome = handleOp('maintenance-reboot', { delayMinutes: 1 });
    expect(outcome.ok).toBe(true);

    // [call index, command, expected argv matcher]
    const orderedCalls: Array<[number, string, unknown]> = [
      [1, 'service', ['mevy', 'onestatus']],
      [2, 'service', ['mevy', 'stop']],
      [4, 'su', ['-m', 'postgres', '-c', '/usr/local/bin/psql -d postgres -tAc "CHECKPOINT"']],
      [
        7,
        'zfs',
        expect.arrayContaining(['snapshot', expect.stringContaining('/pgdata@pre-reboot-')]),
      ],
      [
        8,
        'zfs',
        expect.arrayContaining(['snapshot', expect.stringContaining('/pgwal@pre-reboot-')]),
      ],
    ];
    for (const [nth, command, argv] of orderedCalls) {
      expect(mockSpawnSync).toHaveBeenNthCalledWith(nth, command, argv, expect.any(Object));
    }

    // The reboot itself must be the final command issued.
    expect(mockSpawnSync).toHaveBeenLastCalledWith('shutdown', ['-r', '+1'], expect.any(Object));
  });

  it('continues when mevy is already stopped', () => {
    // The first status probe reports the platform service as down, so no stop
    // command is expected; the remaining seven commands all succeed:
    //   service postgresql onestatus, su postgres psql CHECKPOINT,
    //   service postgresql onestatus, service postgresql stop,
    //   zfs snapshot data, zfs snapshot wal, shutdown
    mockSpawnSync.mockReturnValueOnce(spawnFail('not running')); // service mevy onestatus
    for (let queued = 0; queued < 7; queued += 1) {
      mockSpawnSync.mockReturnValueOnce(spawnOk());
    }

    const outcome = handleOp('maintenance-reboot', {});
    expect(outcome.ok).toBe(true);

    expect(mockSpawnSync).toHaveBeenNthCalledWith(
      1,
      'service',
      ['mevy', 'onestatus'],
      expect.any(Object),
    );
    expect(mockSpawnSync).not.toHaveBeenCalledWith(
      'service',
      ['mevy', 'stop'],
      expect.any(Object),
    );
  });
});

View file

@ -8,6 +8,9 @@ import { spawnSync } from 'child_process';
import { z } from 'zod';
import { formatSnapshotStamp } from '../display-date.js';
import { PLATFORM_SERVICE_NAME, ZFS_PREFIX } from '../config.js';
// ── Param validators ──────────────────────────────────────────────────────────
const jailName = z.string().regex(/^[a-zA-Z0-9_-]{1,32}$/, 'invalid jail name');
@ -61,6 +64,112 @@ function exec(cmd: string, args: string[]): OpResult {
return { ok: exitCode === 0, output, exitCode };
}
/**
 * Run a shell command line as another user by delegating to `su`.
 *
 * @param user    Account name handed to `su`.
 * @param command Command line passed to `su` via `-c`.
 * @returns The result of the underlying `exec('su', …)` call.
 */
function execAs(user: string, command: string): OpResult {
  const suArgs = ['-m', user, '-c', command];
  return exec('su', suArgs);
}
/**
 * Stop a service only if `service <name> onestatus` reports it as running.
 *
 * @param name Service name passed to `service`.
 * @returns The result of `service <name> stop` when the status probe succeeded;
 *          otherwise a synthetic success result noting the service was already stopped.
 */
function serviceMaybeStop(name: string): OpResult {
  const isRunning = exec('service', [name, 'onestatus']).ok;
  return isRunning
    ? exec('service', [name, 'stop'])
    : { ok: true, output: `${name} already stopped` };
}
/**
 * Start a service unless `service <name> onestatus` already reports it as running.
 *
 * @param name Service name passed to `service`.
 * @returns A synthetic success result when the status probe succeeded;
 *          otherwise the result of `service <name> start`.
 */
function serviceMaybeStart(name: string): OpResult {
  const alreadyUp = exec('service', [name, 'onestatus']).ok;
  return alreadyUp
    ? { ok: true, output: `${name} already running` }
    : exec('service', [name, 'start']);
}
/**
 * Resolve the ZFS dataset names holding the host PostgreSQL data and WAL.
 *
 * The pool comes from the `ZFS_POOL` environment variable (default `zroot`) and the
 * optional prefix from `ZFS_PREFIX`. Empty path components are dropped, so an unset or
 * blank prefix yields `<pool>/pgdata` rather than `<pool>//pgdata`, and stray leading or
 * trailing slashes in either value do not produce empty components.
 *
 * @returns Dataset names for the data (`…/pgdata`) and WAL (`…/pgwal`) filesystems.
 */
function buildHostDbDatasets(): { data: string; wal: string } {
  const pool = (process.env.ZFS_POOL || 'zroot').trim() || 'zroot';
  const prefix = (ZFS_PREFIX || '').trim();
  // Join pool and prefix, discarding empty segments (blank prefix, doubled or edge slashes).
  const base = `${pool}/${prefix}`
    .split('/')
    .filter((segment) => segment.length > 0)
    .join('/');
  return {
    data: `${base}/pgdata`,
    wal: `${base}/pgwal`,
  };
}
/**
 * Quiesce the host before a reboot: stop the platform service, CHECKPOINT and stop the
 * host PostgreSQL service, snapshot the data and WAL datasets, then schedule
 * `shutdown -r +<delayMinutes>`.
 *
 * Every step is fail-fast. When a step fails after services were taken down, the
 * services this run stopped are restarted on a best-effort basis (PostgreSQL first,
 * then the platform service) so a failed maintenance reboot does not leave the host
 * with both services down and no reboot scheduled. Services that were already stopped
 * before the call are left untouched.
 *
 * @param delayMinutes Delay handed to `shutdown -r +N`.
 * @returns `ok: true` with a per-step summary once the reboot is scheduled; otherwise
 *          `ok: false` carrying the failing step's error, output (plus any recovery
 *          notes) and exit code.
 */
function runMaintenanceReboot(delayMinutes: number): OpResult {
  // Services this run may have taken down, in stop order.
  const takenDown: string[] = [];

  // Best-effort rollback in reverse stop order; returns one summary line per service.
  const restoreServices = (): string[] =>
    [...takenDown].reverse().map((svc) => {
      const restart = serviceMaybeStart(svc);
      const detail = restart.ok
        ? 'restarted after failed maintenance reboot'
        : `restart failed: ${restart.error || restart.output || 'unknown error'}`;
      return `${svc}: ${detail}`;
    });

  // Keep the original output untouched when there was nothing to recover.
  const appendRecovery = (output: OpResult['output'], recovery: string[]): OpResult['output'] =>
    recovery.length > 0 ? [output, ...recovery].filter(Boolean).join('\n') : output;

  // Build the failure result for a step and roll back whatever was stopped so far.
  const abort = (summary: string, failed: OpResult): OpResult => ({
    ok: false,
    error: `${summary}: ${failed.error || failed.output || 'unknown error'}`,
    output: appendRecovery(failed.output, restoreServices()),
    exitCode: failed.exitCode,
  });

  // 1. Platform service: stop it only if it is running, remembering that we did so.
  const platformStatus = exec('service', [PLATFORM_SERVICE_NAME, 'onestatus']);
  let mevyStop: OpResult;
  if (platformStatus.ok) {
    takenDown.push(PLATFORM_SERVICE_NAME);
    mevyStop = exec('service', [PLATFORM_SERVICE_NAME, 'stop']);
  } else {
    mevyStop = { ok: true, output: `${PLATFORM_SERVICE_NAME} already stopped` };
  }
  if (!mevyStop.ok) {
    return abort(`failed to stop ${PLATFORM_SERVICE_NAME}`, mevyStop);
  }

  // 2. PostgreSQL: issue a CHECKPOINT while it is still up, then stop it.
  const postgresStatus = exec('service', ['postgresql', 'onestatus']);
  if (postgresStatus.ok) {
    const checkpoint = execAs(
      'postgres',
      '/usr/local/bin/psql -d postgres -tAc "CHECKPOINT"',
    );
    if (!checkpoint.ok) {
      return abort('postgresql CHECKPOINT failed', checkpoint);
    }
    takenDown.push('postgresql');
  }
  const postgresStop = serviceMaybeStop('postgresql');
  if (!postgresStop.ok) {
    return abort('failed to stop postgresql', postgresStop);
  }

  // 3. Snapshot data and WAL under one shared snapshot name.
  const stamp = formatSnapshotStamp(new Date());
  const snapshotName = `pre-reboot-${stamp}`;
  const datasets = buildHostDbDatasets();
  for (const dataset of [datasets.data, datasets.wal]) {
    const snapshot = exec('zfs', ['snapshot', `${dataset}@${snapshotName}`]);
    if (!snapshot.ok) {
      return abort(`failed to snapshot ${dataset}`, snapshot);
    }
  }

  // 4. Schedule the reboot; if that fails the host stays up, so bring services back.
  const shutdown = exec('shutdown', ['-r', `+${delayMinutes}`]);
  if (!shutdown.ok) {
    return { ...shutdown, output: appendRecovery(shutdown.output, restoreServices()) };
  }

  return {
    ok: true,
    output: [
      `${PLATFORM_SERVICE_NAME}: ${mevyStop.output || 'stopped'}`,
      `postgresql: ${postgresStop.output || 'stopped'}`,
      `snapshots: ${datasets.data}@${snapshotName}, ${datasets.wal}@${snapshotName}`,
      `reboot: scheduled in ${delayMinutes} minute${delayMinutes === 1 ? '' : 's'}`,
    ].join('\n'),
  };
}
// ── Op registry ───────────────────────────────────────────────────────────────
type ParamMap = Record<string, string | number | boolean>;
@ -230,6 +339,13 @@ const OPS: Record<string, OpEntry> = {
return exec('shutdown', ['-r', `+${delay}`]);
},
},
'maintenance-reboot': {
schema: z.object({ delayMinutes: rebootDelayMinutes.optional() }),
handler: (p) => {
const delay = typeof p.delayMinutes === 'number' ? p.delayMinutes : 1;
return runMaintenanceReboot(delay);
},
},
// Fetch one or more packages (+ deps) to the host cache without installing.
// packages: space-separated list, e.g. "nginx node24 postgresql18-server"

View file

@ -1602,7 +1602,7 @@ export async function handleUpdatesCallback(
}
await ctxArg.answerCallbackQuery({ text: 'Scheduling reboot...' });
const res = await callOperatorHostd('shutdown-reboot', {
const res = await callOperatorHostd('maintenance-reboot', {
delayMinutes: 1,
}).catch((err: any) => ({
ok: false,
@ -1723,8 +1723,8 @@ export async function handleUpdatesCallback(
return;
}
await ctxArg.reply('Idle reached. Scheduling reboot in 1 minute...');
const res = await callOperatorHostd('shutdown-reboot', {
await ctxArg.reply('Idle reached. Scheduling maintenance reboot in 1 minute...');
const res = await callOperatorHostd('maintenance-reboot', {
delayMinutes: 1,
}).catch((err: any) => ({
ok: false,