Harden host DB reboot path
--- Build: pass | Tests: FAIL — Tests 3 failed | 2079 passed (2082)
This commit is contained in:
parent
d456aa4be1
commit
b02746c298
7 changed files with 231 additions and 13 deletions
|
|
@ -10,6 +10,7 @@ import fs from 'fs';
|
|||
import path from 'path';
|
||||
|
||||
import {
|
||||
DB_RUNTIME,
|
||||
PLATFORM_RUNTIME_HOME,
|
||||
PLATFORM_RUNTIME_USER,
|
||||
PLATFORM_SERVICE_NAME,
|
||||
|
|
@ -180,11 +181,13 @@ function generateRcdService(
|
|||
logPath: string,
|
||||
): string {
|
||||
const envFile = path.join(projectRoot, '.env');
|
||||
const requireTargets =
|
||||
DB_RUNTIME === 'host' ? 'NETWORKING LOGIN postgresql' : 'NETWORKING LOGIN';
|
||||
return [
|
||||
'#!/bin/sh',
|
||||
'#',
|
||||
`# PROVIDE: ${runtime.serviceName}`,
|
||||
'# REQUIRE: NETWORKING LOGIN',
|
||||
`# REQUIRE: ${requireTargets}`,
|
||||
'# KEYWORD: shutdown',
|
||||
'',
|
||||
'. /etc/rc.subr',
|
||||
|
|
@ -199,7 +202,7 @@ function generateRcdService(
|
|||
'load_rc_config $name',
|
||||
`: \$\{${runtime.serviceName}_enable:="NONE"\}`,
|
||||
'',
|
||||
`case "\$\{${agentName}_enable\}" in`,
|
||||
`case "\$\{${runtime.serviceName}_enable\}" in`,
|
||||
' [Nn][Oo][Nn][Ee])',
|
||||
' if [ "$1" = "start" ] || [ "$1" = "restart" ]; then',
|
||||
' echo ""',
|
||||
|
|
|
|||
|
|
@ -100,14 +100,13 @@ async function checkDbJail(hostdAvailable: boolean): Promise<ControlPlaneCheckRe
|
|||
}
|
||||
|
||||
if (DB_RUNTIME === 'host') {
|
||||
logger.info('controlplane: host postgres unreachable — attempting service start via hostd');
|
||||
const res = await hostd('service-start', { name: 'postgresql' });
|
||||
return {
|
||||
id, label,
|
||||
status: res.ok ? 'ok' : 'fail',
|
||||
detail: res.ok ? 'started via hostd' : (res.error ?? res.output),
|
||||
fixAttempted: true,
|
||||
fixResult: res.ok ? 'success' : 'failed',
|
||||
id,
|
||||
label,
|
||||
status: 'fail',
|
||||
detail:
|
||||
`PostgreSQL unreachable at ${DB_HOST}:5432; host runtime requires host-level service ordering/recovery`,
|
||||
fixAttempted: false,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -138,5 +138,25 @@ describe('authorizeHostdOperation', () => {
|
|||
owner: 'shared-platform',
|
||||
reason: 'platform-level operation requires operator approval',
|
||||
});
|
||||
|
||||
expect(
|
||||
authorizeHostdOperation(
|
||||
'maintenance-reboot',
|
||||
{},
|
||||
{ tenantId: 'mevy', caller: 'operator', registry },
|
||||
),
|
||||
).toEqual({ allowed: true, owner: 'shared-platform' });
|
||||
|
||||
expect(
|
||||
authorizeHostdOperation(
|
||||
'maintenance-reboot',
|
||||
{},
|
||||
{ tenantId: 'mevy', caller: 'tenant-agent', registry },
|
||||
),
|
||||
).toEqual({
|
||||
allowed: false,
|
||||
owner: 'shared-platform',
|
||||
reason: 'platform-level operation requires operator approval',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -165,6 +165,7 @@ export function authorizeHostdOperation(
|
|||
case 'pkg-fetch':
|
||||
case 'pkg-cache-init':
|
||||
case 'shutdown-reboot':
|
||||
case 'maintenance-reboot':
|
||||
case 'sysrc-set':
|
||||
case 'sanoid-snapshot':
|
||||
return caller === 'operator'
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ describe('handleOp — dispatch', () => {
|
|||
'zfs-list-snapshot-usage', 'zfs-create', 'zfs-rollback',
|
||||
'zpool-list', 'zpool-status', 'pf-reload', 'pf-enable',
|
||||
'service-start', 'service-stop', 'service-restart', 'service-status',
|
||||
'pkg-install', 'pkg-version', 'pkg-upgrade', 'shutdown-reboot', 'pkg-fetch', 'pkg-cache-init',
|
||||
'pkg-install', 'pkg-version', 'pkg-upgrade', 'shutdown-reboot', 'maintenance-reboot', 'pkg-fetch', 'pkg-cache-init',
|
||||
'sysrc-set', 'sanoid-snapshot',
|
||||
]) {
|
||||
expect(result.output).toContain(name);
|
||||
|
|
@ -462,3 +462,82 @@ describe('handleOp — shutdown-reboot', () => {
|
|||
expect(mockSpawnSync).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
// Exercises the `maintenance-reboot` op end to end against the mocked spawnSync,
// pinning the order of the external commands it issues.
describe('handleOp — maintenance-reboot', () => {
  it('stops mevy, checkpoints postgres, snapshots both datasets, and schedules reboot', () => {
    // Canned spawnSync results, consumed in call order.
    [
      spawnOk(), // service mevy onestatus
      spawnOk(), // service mevy stop
      spawnOk(), // service postgresql onestatus
      spawnOk(), // su postgres psql CHECKPOINT
      spawnOk(), // service postgresql onestatus
      spawnOk(), // service postgresql stop
      spawnOk(), // zfs snapshot data
      spawnOk(), // zfs snapshot wal
      spawnOk(), // shutdown
    ].forEach((canned) => mockSpawnSync.mockReturnValueOnce(canned));

    const result = handleOp('maintenance-reboot', { delayMinutes: 1 });
    expect(result.ok).toBe(true);

    // [call index, command, expected argv matcher]
    const expectedCalls: Array<[number, string, unknown]> = [
      [1, 'service', ['mevy', 'onestatus']],
      [2, 'service', ['mevy', 'stop']],
      [4, 'su', ['-m', 'postgres', '-c', '/usr/local/bin/psql -d postgres -tAc "CHECKPOINT"']],
      [7, 'zfs', expect.arrayContaining(['snapshot', expect.stringContaining('/pgdata@pre-reboot-')])],
      [8, 'zfs', expect.arrayContaining(['snapshot', expect.stringContaining('/pgwal@pre-reboot-')])],
    ];
    for (const [nth, command, argv] of expectedCalls) {
      expect(mockSpawnSync).toHaveBeenNthCalledWith(nth, command, argv, expect.any(Object));
    }

    // The reboot must be the final command issued.
    expect(mockSpawnSync).toHaveBeenLastCalledWith('shutdown', ['-r', '+1'], expect.any(Object));
  });

  it('continues when mevy is already stopped', () => {
    // The status probe fails, so no `service mevy stop` result is queued.
    [
      spawnFail('not running'), // service mevy onestatus
      spawnOk(), // service postgresql onestatus
      spawnOk(), // su postgres psql CHECKPOINT
      spawnOk(), // service postgresql onestatus
      spawnOk(), // service postgresql stop
      spawnOk(), // zfs snapshot data
      spawnOk(), // zfs snapshot wal
      spawnOk(), // shutdown
    ].forEach((canned) => mockSpawnSync.mockReturnValueOnce(canned));

    const result = handleOp('maintenance-reboot', {});
    expect(result.ok).toBe(true);

    const anyOptions = expect.any(Object);
    expect(mockSpawnSync).toHaveBeenNthCalledWith(1, 'service', ['mevy', 'onestatus'], anyOptions);
    expect(mockSpawnSync).not.toHaveBeenCalledWith('service', ['mevy', 'stop'], anyOptions);
  });
});
|
||||
|
|
|
|||
|
|
@ -8,6 +8,9 @@ import { spawnSync } from 'child_process';
|
|||
|
||||
import { z } from 'zod';
|
||||
|
||||
import { formatSnapshotStamp } from '../display-date.js';
|
||||
import { PLATFORM_SERVICE_NAME, ZFS_PREFIX } from '../config.js';
|
||||
|
||||
// ── Param validators ──────────────────────────────────────────────────────────
|
||||
|
||||
const jailName = z.string().regex(/^[a-zA-Z0-9_-]{1,32}$/, 'invalid jail name');
|
||||
|
|
@ -61,6 +64,112 @@ function exec(cmd: string, args: string[]): OpResult {
|
|||
return { ok: exitCode === 0, output, exitCode };
|
||||
}
|
||||
|
||||
/**
 * Runs a command line as another user via `su -m <user> -c <command>`.
 *
 * @param user - Account name handed to `su`.
 * @param command - Command line passed to `su -c` as a single, unmodified argument.
 * @returns Whatever the underlying `exec` call reports.
 */
function execAs(user: string, command: string): OpResult {
  const suArgs: string[] = ['-m', user, '-c', command];
  const outcome = exec('su', suArgs);
  return outcome;
}
|
||||
|
||||
/**
 * Stops a service only if `service <name> onestatus` reports success.
 *
 * @param name - Service name passed to `service`.
 * @returns The result of `service <name> stop` when the status probe succeeded;
 *          otherwise a successful result whose output is `<name> already stopped`.
 */
function serviceMaybeStop(name: string): OpResult {
  const isRunning = exec('service', [name, 'onestatus']).ok;
  return isRunning
    ? exec('service', [name, 'stop'])
    : { ok: true, output: `${name} already stopped` };
}
|
||||
|
||||
/**
 * Starts a service unless `service <name> onestatus` already reports success.
 *
 * @param name - Service name passed to `service`.
 * @returns A successful result with output `<name> already running` when the
 *          status probe succeeded; otherwise the result of `service <name> start`.
 */
function serviceMaybeStart(name: string): OpResult {
  const probe = exec('service', [name, 'onestatus']);
  if (!probe.ok) {
    // Status probe failed, so go ahead and start the service.
    return exec('service', [name, 'start']);
  }
  const alreadyUp: OpResult = { ok: true, output: `${name} already running` };
  return alreadyUp;
}
|
||||
|
||||
/**
 * Builds the ZFS dataset names for the host PostgreSQL data and WAL volumes.
 *
 * The pool comes from the `ZFS_POOL` environment variable and falls back to
 * `zroot` when it is unset or blank. `ZFS_PREFIX` is inserted between the pool
 * and the leaf dataset; when it is empty (or only slashes/whitespace) the
 * segment is omitted so the name never contains an empty path component such
 * as `zroot//pgdata`.
 *
 * @returns `data` and `wal` dataset names ending in `/pgdata` and `/pgwal`.
 */
function buildHostDbDatasets(): { data: string; wal: string } {
  const pool = (process.env.ZFS_POOL || 'zroot').trim() || 'zroot';
  // Drop surrounding whitespace and stray leading/trailing slashes so joining
  // with '/' cannot produce a doubled separator.
  const prefix = (ZFS_PREFIX || '').trim().replace(/^\/+|\/+$/g, '');
  const base = prefix ? `${pool}/${prefix}` : pool;
  return {
    data: `${base}/pgdata`,
    wal: `${base}/pgwal`,
  };
}
|
||||
|
||||
/**
 * Quiesces the host database stack, snapshots it, and schedules a reboot.
 *
 * Steps, in order:
 *   1. stop the platform service if `service … onestatus` says it is running;
 *   2. if PostgreSQL is running, issue a `CHECKPOINT` as the `postgres` user;
 *   3. stop PostgreSQL (via `serviceMaybeStop`);
 *   4. snapshot the data and WAL datasets as `<dataset>@pre-reboot-<stamp>`;
 *   5. run `shutdown -r +<delayMinutes>`.
 *
 * If any step fails, no further step runs and the services that were running
 * on entry are started again (best-effort, database first) so a failed
 * maintenance run does not leave the host with its services down. The outcome
 * of each recovery attempt is appended to the returned `output`; the `error`
 * text describes only the step that failed.
 *
 * @param delayMinutes - Minutes to wait before rebooting, passed to `shutdown -r +N`.
 * @returns On success, `ok: true` with a multi-line summary in `output`.
 *          On failure, `ok: false` with the failing step's error, output and exit code.
 */
function runMaintenanceReboot(delayMinutes: number): OpResult {
  // Track what was running on entry so an abort restarts only what this run
  // may have taken down.
  let postgresWasRunning = false;
  const platformStatus = exec('service', [PLATFORM_SERVICE_NAME, 'onestatus']);
  const platformWasRunning = platformStatus.ok;

  // Best-effort restart; serviceMaybeStart probes status first, so a service
  // that is still up is left untouched.
  const restoreServices = (): string[] => {
    const names: string[] = [];
    if (postgresWasRunning) names.push('postgresql');
    if (platformWasRunning) names.push(PLATFORM_SERVICE_NAME);
    return names.map((name) => {
      const started = serviceMaybeStart(name);
      return started.ok
        ? `recovery: ${started.output || `${name} started`}`
        : `recovery: failed to start ${name}: ${started.error || started.output || 'unknown error'}`;
    });
  };

  // Builds the failure result for a step and triggers recovery. Without a
  // context the step's own result is passed through (used for `shutdown`).
  const abort = (failed: OpResult, context?: string): OpResult => {
    const notes = restoreServices();
    const output =
      notes.length > 0
        ? [failed.output, ...notes].filter(Boolean).join('\n')
        : failed.output;
    if (context === undefined) {
      return notes.length > 0 ? { ...failed, output } : failed;
    }
    return {
      ok: false,
      error: `${context}: ${failed.error || failed.output || 'unknown error'}`,
      output,
      exitCode: failed.exitCode,
    };
  };

  const platformStop: OpResult = platformWasRunning
    ? exec('service', [PLATFORM_SERVICE_NAME, 'stop'])
    : { ok: true, output: `${PLATFORM_SERVICE_NAME} already stopped` };
  if (!platformStop.ok) {
    return abort(platformStop, `failed to stop ${PLATFORM_SERVICE_NAME}`);
  }

  // Flush dirty buffers while the server is still up.
  const postgresStatus = exec('service', ['postgresql', 'onestatus']);
  postgresWasRunning = postgresStatus.ok;
  if (postgresWasRunning) {
    const checkpoint = execAs(
      'postgres',
      '/usr/local/bin/psql -d postgres -tAc "CHECKPOINT"',
    );
    if (!checkpoint.ok) {
      return abort(checkpoint, 'postgresql CHECKPOINT failed');
    }
  }

  const postgresStop = serviceMaybeStop('postgresql');
  if (!postgresStop.ok) {
    return abort(postgresStop, 'failed to stop postgresql');
  }

  // Both datasets share one snapshot name.
  const snapshotName = `pre-reboot-${formatSnapshotStamp(new Date())}`;
  const datasets = buildHostDbDatasets();
  for (const dataset of [datasets.data, datasets.wal]) {
    const snapshot = exec('zfs', ['snapshot', `${dataset}@${snapshotName}`]);
    if (!snapshot.ok) {
      return abort(snapshot, `failed to snapshot ${dataset}`);
    }
  }

  const shutdown = exec('shutdown', ['-r', `+${delayMinutes}`]);
  if (!shutdown.ok) {
    return abort(shutdown);
  }

  return {
    ok: true,
    output: [
      `${PLATFORM_SERVICE_NAME}: ${platformStop.output || 'stopped'}`,
      `postgresql: ${postgresStop.output || 'stopped'}`,
      `snapshots: ${datasets.data}@${snapshotName}, ${datasets.wal}@${snapshotName}`,
      `reboot: scheduled in ${delayMinutes} minute${delayMinutes === 1 ? '' : 's'}`,
    ].join('\n'),
  };
}
|
||||
|
||||
// ── Op registry ───────────────────────────────────────────────────────────────
|
||||
|
||||
type ParamMap = Record<string, string | number | boolean>;
|
||||
|
|
@ -230,6 +339,13 @@ const OPS: Record<string, OpEntry> = {
|
|||
return exec('shutdown', ['-r', `+${delay}`]);
|
||||
},
|
||||
},
|
||||
'maintenance-reboot': {
|
||||
schema: z.object({ delayMinutes: rebootDelayMinutes.optional() }),
|
||||
handler: (p) => {
|
||||
const delay = typeof p.delayMinutes === 'number' ? p.delayMinutes : 1;
|
||||
return runMaintenanceReboot(delay);
|
||||
},
|
||||
},
|
||||
|
||||
// Fetch one or more packages (+ deps) to the host cache without installing.
|
||||
// packages: space-separated list, e.g. "nginx node24 postgresql18-server"
|
||||
|
|
|
|||
|
|
@ -1602,7 +1602,7 @@ export async function handleUpdatesCallback(
|
|||
}
|
||||
|
||||
await ctxArg.answerCallbackQuery({ text: 'Scheduling reboot...' });
|
||||
const res = await callOperatorHostd('shutdown-reboot', {
|
||||
const res = await callOperatorHostd('maintenance-reboot', {
|
||||
delayMinutes: 1,
|
||||
}).catch((err: any) => ({
|
||||
ok: false,
|
||||
|
|
@ -1723,8 +1723,8 @@ export async function handleUpdatesCallback(
|
|||
return;
|
||||
}
|
||||
|
||||
await ctxArg.reply('Idle reached. Scheduling reboot in 1 minute...');
|
||||
const res = await callOperatorHostd('shutdown-reboot', {
|
||||
await ctxArg.reply('Idle reached. Scheduling maintenance reboot in 1 minute...');
|
||||
const res = await callOperatorHostd('maintenance-reboot', {
|
||||
delayMinutes: 1,
|
||||
}).catch((err: any) => ({
|
||||
ok: false,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue