fix(rc): FreeBSD rc.d deep-audit — 6 bugs after live-host testing #75

Merged
clawdie merged 3 commits from fix/freebsd-rc-d-deep-audit into main 2026-06-15 09:24:20 +02:00
8 changed files with 266 additions and 54 deletions

View file

@ -71,9 +71,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Start the Colibri control-plane socket server
let socket_state = state.clone();
let socket_shutdown = state.shutdown_rx.resubscribe();
let socket_handle = tokio::spawn(async move {
socket::serve(socket_state, socket_shutdown).await;
});
let socket_handle =
tokio::spawn(async move { socket::serve(socket_state, socket_shutdown).await });
// Start the daemon background loop (heartbeat, session rotation, scheduler)
let loop_state = state.clone();
@ -86,8 +85,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Listen for shutdown signals
let shutdown_state = state.clone();
tokio::spawn(async move {
tokio::signal::ctrl_c().await.ok();
info!("received interrupt signal, initiating graceful shutdown");
wait_for_shutdown_signal().await;
info!("received shutdown signal, initiating graceful shutdown");
let _ = shutdown_state.shutdown_tx.send(());
// Give sub-tasks a moment to clean up
@ -103,9 +102,49 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Wait for both the socket server and the daemon loop to finish
let (socket_result, loop_result) = tokio::join!(socket_handle, loop_handle);
socket_result?;
socket_result??;
loop_result?;
info!("colibri-daemon shut down cleanly");
Ok(())
}
/// Wait for a process shutdown signal.
///
/// On Unix this resolves on **either** SIGTERM (what `service stop`/`restart`
/// sends — daemon(8) forwards it to this child) or SIGINT (Ctrl+C). Handling
/// SIGTERM is what lets the graceful path run on a normal `service stop`: the
/// socket is removed (see `socket::serve`) and spawned agents are reaped. Awaiting
/// only `ctrl_c()` (SIGINT) would leave SIGTERM on its default disposition —
/// immediate kill with no cleanup, leaking the socket file and child agents.
#[cfg(unix)]
async fn wait_for_shutdown_signal() {
use tokio::signal::unix::{signal, SignalKind};
let mut sigterm = match signal(SignalKind::terminate()) {
Ok(s) => s,
Err(e) => {
tracing::error!(error = %e, "failed to install SIGTERM handler; falling back to SIGINT only");
tokio::signal::ctrl_c().await.ok();
return;
}
};
let mut sigint = match signal(SignalKind::interrupt()) {
Ok(s) => s,
Err(e) => {
tracing::error!(error = %e, "failed to install SIGINT handler; waiting on SIGTERM only");
sigterm.recv().await;
return;
}
};
tokio::select! {
_ = sigterm.recv() => info!("received SIGTERM"),
_ = sigint.recv() => info!("received SIGINT"),
}
}
#[cfg(not(unix))]
async fn wait_for_shutdown_signal() {
tokio::signal::ctrl_c().await.ok();
}

View file

@ -7,7 +7,7 @@
//! - Client sends: `{"cmd":"status"}\n`
//! - Server responds: `{"ok":true,"data":{...}}\n`
use std::time::SystemTime;
use std::{io, time::SystemTime};
use std::collections::HashMap;
@ -26,20 +26,59 @@ use crate::{ColibriCommand, ColibriResponse, SharedState};
// Socket server
// ---------------------------------------------------------------------------
/// Prepare `socket_path` for `bind()`: remove a stale socket file, but never one
/// a live daemon is still serving.
///
/// A bare `remove_file` would happily delete the socket of a running instance —
/// e.g. if rc.subr's pid detection misfires and a second daemon starts — which
/// orphans the first and races two listeners on the same path. So probe first:
///
/// - file absent → nothing to do, `true` (safe to bind)
/// - connect succeeds → another daemon owns it → `false` (caller must NOT bind)
/// - connect refused / not a socket → stale leftover → remove it, `true`
///
/// Returns `true` when the caller may proceed to `bind()`.
async fn clear_stale_socket(socket_path: &std::path::Path) -> bool {
if !socket_path.exists() {
return true;
}
match UnixStream::connect(socket_path).await {
Ok(_) => {
error!(
path = %socket_path.display(),
"another colibri-daemon is already listening on this socket; refusing to start"
);
false
}
Err(_) => {
// Stale socket (connection refused, or not a socket): safe to remove.
if let Err(e) = tokio::fs::remove_file(socket_path).await {
warn!(
path = %socket_path.display(),
error = %e,
"failed to remove stale socket file"
);
}
true
}
}
}
/// Run the Colibri control-plane socket server. Binds to the configured Unix socket path
/// and processes inbound commands until a shutdown signal is received.
pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) {
pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) -> io::Result<()> {
let socket_path = state.config.socket_path.clone();
// Clean up stale socket file
if socket_path.exists() {
if let Err(e) = tokio::fs::remove_file(&socket_path).await {
warn!(
path = %socket_path.display(),
error = %e,
"failed to remove stale socket file"
);
}
// Clear a stale socket file, refusing to start if a live daemon owns it.
if !clear_stale_socket(&socket_path).await {
let _ = state.shutdown_tx.send(());
return Err(io::Error::new(
io::ErrorKind::AddrInUse,
format!(
"another colibri-daemon is already listening on {}",
socket_path.display()
),
));
}
// Ensure parent directory exists
@ -50,7 +89,8 @@ pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>)
error = %e,
"failed to create socket parent directory"
);
return;
let _ = state.shutdown_tx.send(());
return Err(e);
}
}
@ -62,7 +102,8 @@ pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>)
error = %e,
"failed to bind Unix socket"
);
return;
let _ = state.shutdown_tx.send(());
return Err(e);
}
};
@ -111,6 +152,7 @@ pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>)
// Clean up socket file on graceful exit
let _ = tokio::fs::remove_file(&socket_path).await;
info!("Colibri control-plane socket shut down");
Ok(())
}
// ---------------------------------------------------------------------------
@ -785,4 +827,56 @@ mod tests {
assert_eq!(pane.state, colibri_glasspane::AgentState::Done);
assert_eq!(pane.pi_session_id.as_deref(), Some("pi-fake"));
}
#[tokio::test]
async fn clear_stale_socket_returns_true_when_absent() {
let dir = std::env::temp_dir().join(format!(
"colibri-daemon-socket-test-{}",
uuid::Uuid::new_v4()
));
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("colibri.sock");
assert!(!path.exists());
assert!(clear_stale_socket(&path).await, "absent socket is bindable");
let _ = std::fs::remove_dir_all(&dir);
}
#[tokio::test]
async fn clear_stale_socket_removes_dead_socket_file() {
let dir = std::env::temp_dir().join(format!(
"colibri-daemon-socket-test-{}",
uuid::Uuid::new_v4()
));
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("colibri.sock");
// A bound-then-dropped listener leaves the socket file with no listener
// behind it — exactly the "stale leftover after a hard kill" case.
let listener = UnixListener::bind(&path).unwrap();
drop(listener);
assert!(path.exists(), "dropped listener leaves the socket file");
assert!(
clear_stale_socket(&path).await,
"stale socket should be cleared and bindable"
);
assert!(!path.exists(), "stale socket file should have been removed");
let _ = std::fs::remove_dir_all(&dir);
}
#[tokio::test]
async fn clear_stale_socket_refuses_when_live_listener_present() {
let dir = std::env::temp_dir().join(format!(
"colibri-daemon-socket-test-{}",
uuid::Uuid::new_v4()
));
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("colibri.sock");
// Hold a live listener: a connect will succeed, so cleanup must refuse.
let _listener = UnixListener::bind(&path).unwrap();
assert!(
!clear_stale_socket(&path).await,
"must refuse to clear a socket a live daemon is serving"
);
assert!(path.exists(), "live socket must not be removed");
let _ = std::fs::remove_dir_all(&dir);
}
}

View file

@ -35,7 +35,7 @@ async fn intake_task_over_socket_drains_to_sqlite_via_run_loop() {
let server_state = state.clone();
let server_shutdown = state.shutdown_rx.resubscribe();
let server = tokio::spawn(async move {
socket::serve(server_state, server_shutdown).await;
let _ = socket::serve(server_state, server_shutdown).await;
});
// Background loop with a fast scheduler tick so the test stays quick.

View file

@ -0,0 +1,74 @@
//! Graceful-shutdown-on-SIGTERM proof — integration test.
//!
//! `service stop`/`restart` on FreeBSD sends SIGTERM (daemon(8) forwards it to
//! the colibri-daemon child). The daemon must treat SIGTERM like SIGINT: run the
//! graceful path and remove its Unix socket on the way out. If SIGTERM were left
//! on its default disposition, the child would die without cleanup and leak the
//! socket file — which is what forced the rc.d script to `rm` it. This test
//! spawns the real binary, waits for the socket, sends SIGTERM, and asserts the
//! socket is gone.
#![cfg(unix)]
use std::process::Command as StdCommand;
use std::time::{Duration, Instant};
#[tokio::test]
async fn sigterm_triggers_graceful_socket_cleanup() {
let tmp = std::env::temp_dir().join(format!("colibri-sigterm-test-{}", std::process::id()));
std::fs::create_dir_all(&tmp).expect("create temp data dir");
let socket_path = tmp.join("colibri.sock");
let bin = env!("CARGO_BIN_EXE_colibri-daemon");
let mut child = StdCommand::new(bin)
.env("COLIBRI_DAEMON_DATA_DIR", &tmp)
.env("COLIBRI_DAEMON_SOCKET", &socket_path)
.env("COLIBRI_DB_PATH", tmp.join("colibri.sqlite"))
.env("COLIBRI_HOST", "sigterm-test")
// Keep logs quiet; errors still surface on failure via the assertions.
.env("RUST_LOG", "warn")
.spawn()
.expect("failed to spawn colibri-daemon");
// Wait for the daemon to bind its socket (it forks no children of its own
// here; the file appears once `socket::serve` calls bind()).
let deadline = Instant::now() + Duration::from_secs(15);
while !socket_path.exists() {
if Instant::now() > deadline {
let _ = child.kill();
panic!("daemon did not create socket within timeout");
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
// Send SIGTERM (the signal `service stop` delivers), then wait for exit.
let pid = child.id();
let status = StdCommand::new("kill")
.arg("-TERM")
.arg(pid.to_string())
.status()
.expect("failed to run kill");
assert!(status.success(), "kill -TERM failed");
// Poll for process exit so we don't block forever if shutdown hangs.
let deadline = Instant::now() + Duration::from_secs(15);
loop {
match child.try_wait().expect("try_wait failed") {
Some(_) => break,
None => {
if Instant::now() > deadline {
let _ = child.kill();
panic!("daemon did not exit within timeout after SIGTERM");
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}
assert!(
!socket_path.exists(),
"socket file should be removed by the graceful SIGTERM path, but it still exists at {}",
socket_path.display()
);
let _ = std::fs::remove_dir_all(&tmp);
}

View file

@ -65,7 +65,7 @@ tasks
scheduler.interval_secs
```
Confirm `cost.mode` matches `colibri_cost_mode` from rc.conf.
Confirm `cost.mode` matches `colibri_daemon_cost_mode` from rc.conf.
## 3. Task board check

View file

@ -19,6 +19,7 @@
/usr/local/etc/rc.d/colibri_daemon ← rc.d service script
/usr/local/etc/colibri/
rc.conf.sample ← service config template
provider.env ← optional API keys (mode 0600, root-owned)
/usr/local/etc/newsyslog.conf.d/
colibri.conf ← log rotation (1MB, 7 archives)
@ -27,8 +28,9 @@
sessions/ ← JSONL session files
/var/run/colibri/ ← tmpfs (recreated each boot)
colibri.sock ← Unix socket (750 colibri:colibri)
colibri-daemon.pid ← daemon(8) supervisor pidfile
colibri.sock ← Unix socket (770 colibri:colibri, set by daemon)
colibri-daemon.pid ← child pidfile (daemon(8) -p)
colibri-daemon-supervisor.pid ← supervisor pidfile (daemon(8) -P)
/var/log/colibri/ ← persistent logs
daemon.log ← stdout/stderr from colibri-daemon
@ -45,8 +47,8 @@
## Shutdown behavior
1. `rc.d` sends SIGTERM to daemon(8)
2. daemon(8) forwards to colibri-daemon child
1. `rc.d` sends SIGTERM to daemon(8) supervisor (via custom `stop_cmd`)
2. Supervisor forwards signal to colibri-daemon child and exits (no restart)
3. Colibri closes socket, flushes SQLite, stops scheduler
4. `poststop`: removes stale socket from tmpfs
@ -67,17 +69,17 @@ colibri list-tasks --status queued
## Config knobs (set in /etc/rc.conf or /etc/rc.conf.d/colibri_daemon)
| Variable | Default | Purpose |
| ------------------------- | --------------------------- | ----------------------- |
| `colibri_daemon_enable` | NO | Enable at boot (YES/NO) |
| `colibri_daemon_user` | colibri | Runtime user |
| `colibri_daemon_group` | colibri | Runtime group |
| `colibri_cost_mode` | smart | fast/smart/max |
| `colibri_daemon_data_dir` | /var/db/colibri | Persistent data |
| `colibri_daemon_run_dir` | /var/run/colibri | tmpfs runtime |
| `colibri_daemon_socket` | $run_dir/colibri.sock | Unix socket path |
| `colibri_daemon_db_path` | $data_dir/colibri.sqlite | SQLite path |
| `colibri_daemon_logfile` | /var/log/colibri/daemon.log | Log output |
| Variable | Default | Purpose |
| -------------------------- | --------------------------- | ----------------------- |
| `colibri_daemon_enable` | NO | Enable at boot (YES/NO) |
| `colibri_daemon_user` | colibri | Runtime user |
| `colibri_daemon_group` | colibri | Runtime group |
| `colibri_daemon_cost_mode` | smart | fast/smart/max |
| `colibri_daemon_data_dir` | /var/db/colibri | Persistent data |
| `colibri_daemon_run_dir` | /var/run/colibri | tmpfs runtime |
| `colibri_daemon_socket` | $run_dir/colibri.sock | Unix socket path |
| `colibri_daemon_db_path` | $data_dir/colibri.sqlite | SQLite path |
| `colibri_daemon_logfile` | /var/log/colibri/daemon.log | Log output |
## Log rotation

View file

@ -46,7 +46,7 @@ load_rc_config $name
: ${colibri_daemon_logfile:="/var/log/colibri/daemon.log"}
: ${colibri_daemon_provider_env:="/usr/local/etc/colibri/provider.env"}
: ${colibri_daemon_host:="$(/bin/hostname)"}
: ${colibri_cost_mode:="smart"}
: ${colibri_daemon_cost_mode:="smart"}
pidfile="${colibri_daemon_run_dir}/colibri-daemon.pid"
# Supervisor pidfile (the daemon(8) parent). Kept distinct from the child
@ -84,9 +84,12 @@ colibri_daemon_prestart()
install -d -o "${colibri_daemon_user}" -g "${colibri_daemon_group}" -m 0750 \
"$(/usr/bin/dirname "${colibri_daemon_logfile}")"
# Clear runtime files before starting so repeated service starts always
# bind a fresh socket and write fresh pidfiles.
rm -f "${colibri_daemon_socket}" "${pidfile}" "${supervisor_pidfile}"
# Clear stale pidfiles before starting so a previous hard-killed run does
# not trip rc.subr's "already running" check. The socket is left to the
# daemon: it removes a stale socket on bind but refuses to start if a live
# daemon is already listening, so an rc-side rm here could only clobber a
# running instance.
rm -f "${pidfile}" "${supervisor_pidfile}"
# Provider keys are optional. Keep them in a root-owned env file instead of
# rc.conf so they are easy to rotate and not world-readable.
@ -101,7 +104,7 @@ colibri_daemon_prestart()
export COLIBRI_DAEMON_SOCKET="${colibri_daemon_socket}"
export COLIBRI_DB_PATH="${colibri_daemon_db_path}"
export COLIBRI_HOST="${colibri_daemon_host}"
export COLIBRI_COST_MODE="${colibri_cost_mode}"
export COLIBRI_COST_MODE="${colibri_daemon_cost_mode}"
export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
}
@ -117,8 +120,6 @@ colibri_daemon_poststart()
if [ -S "${colibri_daemon_socket}" ]; then
echo "colibri-daemon socket ready after ${waited}s"
chmod 644 "${pidfile}" 2>/dev/null || true
chmod 660 "${colibri_daemon_socket}" 2>/dev/null || true
else
echo "WARNING: colibri-daemon socket not ready after ${timeout}s"
fi
@ -169,18 +170,20 @@ colibri_daemon_poststop()
health_cmd="colibri_daemon_health"
colibri_daemon_health()
{
if [ -S "${colibri_daemon_socket}" ]; then
if printf '{"cmd":"status"}\n' | nc -U "${colibri_daemon_socket}" -w 2 >/dev/null 2>&1; then
echo "colibri-daemon is healthy (socket responding)"
return 0
else
echo "colibri-daemon socket exists but not responding"
return 1
fi
else
if [ ! -S "${colibri_daemon_socket}" ]; then
echo "colibri-daemon socket not found"
return 1
fi
local _resp
_resp=$(printf '{"cmd":"status"}\n' | nc -U "${colibri_daemon_socket}" -w 2 2>/dev/null)
if [ -n "${_resp}" ]; then
echo "colibri-daemon is healthy (socket responding)"
return 0
else
echo "colibri-daemon socket exists but no response"
return 1
fi
}
run_rc_command "$1"

View file

@ -68,8 +68,8 @@ colibri_daemon_run_dir="/var/run/colibri"
colibri_daemon_socket="/var/run/colibri/colibri.sock"
colibri_daemon_db_path="/var/db/colibri/colibri.sqlite"
colibri_daemon_logfile="/var/log/colibri/daemon.log"
colibri_daemon_host="\$(hostname)"
colibri_cost_mode="smart"
colibri_daemon_host="\$(/bin/hostname)"
colibri_daemon_cost_mode="smart"
EOF
cat > "$ETC_DIR/README.iso" <<'EOF'