fix(daemon): handle SIGTERM + liveness-aware socket cleanup (Sam & Claude)
The rc.d "rm stale socket on prestart" fix (07e4660) was a band-aid over
two daemon-side defects that surfaced on the live FreeBSD host:
1. colibri-daemon never handled SIGTERM. main.rs awaited only ctrl_c()
(SIGINT), so `service stop`/`restart` — which sends SIGTERM via
daemon(8) to the child — killed it on the default disposition with no
cleanup. The graceful path (socket removal, agent reaping) never ran,
leaking the socket file and orphaning spawned agents across restarts.
Now wait_for_shutdown_signal() selects on SIGTERM or SIGINT, so the
same graceful path runs on a normal service stop. New integration test
(tests/sigterm_shutdown.rs) spawns the binary, sends SIGTERM, and
asserts the socket is removed.
2. Stale-socket cleanup had no liveness check — both the daemon
(socket.rs) and the rc prestart would unconditionally rm the socket
before bind, which could delete a *running* instance's socket if
rc.subr's pid detection misfires and starts a second daemon. Cleanup
now probes first (clear_stale_socket): connect succeeds -> refuse to
start; refused/dead -> remove and bind. Unit-tested for absent, stale,
and live cases.
With the daemon owning safe socket cleanup, the rc prestart no longer
removes the socket (only stale pidfiles), eliminating the restart-time
clobber hazard. This also makes the SIGTERM shutdown described in
ISO-SERVICE-LAYOUT.md (PR #75) actually true.
Gates: cargo fmt --check, clippy -D warnings, cargo test --workspace all
green on Linux; sh -n on the rc script OK. FreeBSD runtime validation
still pending per FREEBSD-BUILD-LANE-HANDOFF.md.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
df5fbab051
commit
b32c3acaed
4 changed files with 215 additions and 14 deletions
|
|
@ -86,8 +86,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
// Listen for shutdown signals
|
||||
let shutdown_state = state.clone();
|
||||
tokio::spawn(async move {
|
||||
tokio::signal::ctrl_c().await.ok();
|
||||
info!("received interrupt signal, initiating graceful shutdown");
|
||||
wait_for_shutdown_signal().await;
|
||||
info!("received shutdown signal, initiating graceful shutdown");
|
||||
let _ = shutdown_state.shutdown_tx.send(());
|
||||
|
||||
// Give sub-tasks a moment to clean up
|
||||
|
|
@ -109,3 +109,43 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
info!("colibri-daemon shut down cleanly");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wait for a process shutdown signal.
|
||||
///
|
||||
/// On Unix this resolves on **either** SIGTERM (what `service stop`/`restart`
|
||||
/// sends — daemon(8) forwards it to this child) or SIGINT (Ctrl+C). Handling
|
||||
/// SIGTERM is what lets the graceful path run on a normal `service stop`: the
|
||||
/// socket is removed (see `socket::serve`) and spawned agents are reaped. Awaiting
|
||||
/// only `ctrl_c()` (SIGINT) would leave SIGTERM on its default disposition —
|
||||
/// immediate kill with no cleanup, leaking the socket file and child agents.
|
||||
#[cfg(unix)]
|
||||
async fn wait_for_shutdown_signal() {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
let mut sigterm = match signal(SignalKind::terminate()) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "failed to install SIGTERM handler; falling back to SIGINT only");
|
||||
tokio::signal::ctrl_c().await.ok();
|
||||
return;
|
||||
}
|
||||
};
|
||||
let mut sigint = match signal(SignalKind::interrupt()) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "failed to install SIGINT handler; waiting on SIGTERM only");
|
||||
sigterm.recv().await;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
_ = sigterm.recv() => info!("received SIGTERM"),
|
||||
_ = sigint.recv() => info!("received SIGINT"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(unix))]
|
||||
async fn wait_for_shutdown_signal() {
|
||||
tokio::signal::ctrl_c().await.ok();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,20 +26,52 @@ use crate::{ColibriCommand, ColibriResponse, SharedState};
|
|||
// Socket server
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Prepare `socket_path` for `bind()`: remove a stale socket file, but never one
|
||||
/// a live daemon is still serving.
|
||||
///
|
||||
/// A bare `remove_file` would happily delete the socket of a running instance —
|
||||
/// e.g. if rc.subr's pid detection misfires and a second daemon starts — which
|
||||
/// orphans the first and races two listeners on the same path. So probe first:
|
||||
///
|
||||
/// - file absent → nothing to do, `true` (safe to bind)
|
||||
/// - connect succeeds → another daemon owns it → `false` (caller must NOT bind)
|
||||
/// - connect refused / not a socket → stale leftover → remove it, `true`
|
||||
///
|
||||
/// Returns `true` when the caller may proceed to `bind()`.
|
||||
async fn clear_stale_socket(socket_path: &std::path::Path) -> bool {
|
||||
if !socket_path.exists() {
|
||||
return true;
|
||||
}
|
||||
match UnixStream::connect(socket_path).await {
|
||||
Ok(_) => {
|
||||
error!(
|
||||
path = %socket_path.display(),
|
||||
"another colibri-daemon is already listening on this socket; refusing to start"
|
||||
);
|
||||
false
|
||||
}
|
||||
Err(_) => {
|
||||
// Stale socket (connection refused, or not a socket): safe to remove.
|
||||
if let Err(e) = tokio::fs::remove_file(socket_path).await {
|
||||
warn!(
|
||||
path = %socket_path.display(),
|
||||
error = %e,
|
||||
"failed to remove stale socket file"
|
||||
);
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the Colibri control-plane socket server. Binds to the configured Unix socket path
|
||||
/// and processes inbound commands until a shutdown signal is received.
|
||||
pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) {
|
||||
let socket_path = state.config.socket_path.clone();
|
||||
|
||||
// Clean up stale socket file
|
||||
if socket_path.exists() {
|
||||
if let Err(e) = tokio::fs::remove_file(&socket_path).await {
|
||||
warn!(
|
||||
path = %socket_path.display(),
|
||||
error = %e,
|
||||
"failed to remove stale socket file"
|
||||
);
|
||||
}
|
||||
// Clear a stale socket file, refusing to start if a live daemon owns it.
|
||||
if !clear_stale_socket(&socket_path).await {
|
||||
return;
|
||||
}
|
||||
|
||||
// Ensure parent directory exists
|
||||
|
|
@ -785,4 +817,56 @@ mod tests {
|
|||
assert_eq!(pane.state, colibri_glasspane::AgentState::Done);
|
||||
assert_eq!(pane.pi_session_id.as_deref(), Some("pi-fake"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn clear_stale_socket_returns_true_when_absent() {
|
||||
let dir = std::env::temp_dir().join(format!(
|
||||
"colibri-daemon-socket-test-{}",
|
||||
uuid::Uuid::new_v4()
|
||||
));
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let path = dir.join("colibri.sock");
|
||||
assert!(!path.exists());
|
||||
assert!(clear_stale_socket(&path).await, "absent socket is bindable");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn clear_stale_socket_removes_dead_socket_file() {
|
||||
let dir = std::env::temp_dir().join(format!(
|
||||
"colibri-daemon-socket-test-{}",
|
||||
uuid::Uuid::new_v4()
|
||||
));
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let path = dir.join("colibri.sock");
|
||||
// A bound-then-dropped listener leaves the socket file with no listener
|
||||
// behind it — exactly the "stale leftover after a hard kill" case.
|
||||
let listener = UnixListener::bind(&path).unwrap();
|
||||
drop(listener);
|
||||
assert!(path.exists(), "dropped listener leaves the socket file");
|
||||
assert!(
|
||||
clear_stale_socket(&path).await,
|
||||
"stale socket should be cleared and bindable"
|
||||
);
|
||||
assert!(!path.exists(), "stale socket file should have been removed");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn clear_stale_socket_refuses_when_live_listener_present() {
|
||||
let dir = std::env::temp_dir().join(format!(
|
||||
"colibri-daemon-socket-test-{}",
|
||||
uuid::Uuid::new_v4()
|
||||
));
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let path = dir.join("colibri.sock");
|
||||
// Hold a live listener: a connect will succeed, so cleanup must refuse.
|
||||
let _listener = UnixListener::bind(&path).unwrap();
|
||||
assert!(
|
||||
!clear_stale_socket(&path).await,
|
||||
"must refuse to clear a socket a live daemon is serving"
|
||||
);
|
||||
assert!(path.exists(), "live socket must not be removed");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
74
crates/colibri-daemon/tests/sigterm_shutdown.rs
Normal file
74
crates/colibri-daemon/tests/sigterm_shutdown.rs
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
//! Graceful-shutdown-on-SIGTERM proof — integration test.
|
||||
//!
|
||||
//! `service stop`/`restart` on FreeBSD sends SIGTERM (daemon(8) forwards it to
|
||||
//! the colibri-daemon child). The daemon must treat SIGTERM like SIGINT: run the
|
||||
//! graceful path and remove its Unix socket on the way out. If SIGTERM were left
|
||||
//! on its default disposition, the child would die without cleanup and leak the
|
||||
//! socket file — which is what forced the rc.d script to `rm` it. This test
|
||||
//! spawns the real binary, waits for the socket, sends SIGTERM, and asserts the
|
||||
//! socket is gone.
|
||||
#![cfg(unix)]
|
||||
|
||||
use std::process::Command as StdCommand;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
#[tokio::test]
|
||||
async fn sigterm_triggers_graceful_socket_cleanup() {
|
||||
let tmp = std::env::temp_dir().join(format!("colibri-sigterm-test-{}", std::process::id()));
|
||||
std::fs::create_dir_all(&tmp).expect("create temp data dir");
|
||||
let socket_path = tmp.join("colibri.sock");
|
||||
|
||||
let bin = env!("CARGO_BIN_EXE_colibri-daemon");
|
||||
let mut child = StdCommand::new(bin)
|
||||
.env("COLIBRI_DAEMON_DATA_DIR", &tmp)
|
||||
.env("COLIBRI_DAEMON_SOCKET", &socket_path)
|
||||
.env("COLIBRI_DB_PATH", tmp.join("colibri.sqlite"))
|
||||
.env("COLIBRI_HOST", "sigterm-test")
|
||||
// Keep logs quiet; errors still surface on failure via the assertions.
|
||||
.env("RUST_LOG", "warn")
|
||||
.spawn()
|
||||
.expect("failed to spawn colibri-daemon");
|
||||
|
||||
// Wait for the daemon to bind its socket (it forks no children of its own
|
||||
// here; the file appears once `socket::serve` calls bind()).
|
||||
let deadline = Instant::now() + Duration::from_secs(15);
|
||||
while !socket_path.exists() {
|
||||
if Instant::now() > deadline {
|
||||
let _ = child.kill();
|
||||
panic!("daemon did not create socket within timeout");
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// Send SIGTERM (the signal `service stop` delivers), then wait for exit.
|
||||
let pid = child.id();
|
||||
let status = StdCommand::new("kill")
|
||||
.arg("-TERM")
|
||||
.arg(pid.to_string())
|
||||
.status()
|
||||
.expect("failed to run kill");
|
||||
assert!(status.success(), "kill -TERM failed");
|
||||
|
||||
// Poll for process exit so we don't block forever if shutdown hangs.
|
||||
let deadline = Instant::now() + Duration::from_secs(15);
|
||||
loop {
|
||||
match child.try_wait().expect("try_wait failed") {
|
||||
Some(_) => break,
|
||||
None => {
|
||||
if Instant::now() > deadline {
|
||||
let _ = child.kill();
|
||||
panic!("daemon did not exit within timeout after SIGTERM");
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert!(
|
||||
!socket_path.exists(),
|
||||
"socket file should be removed by the graceful SIGTERM path, but it still exists at {}",
|
||||
socket_path.display()
|
||||
);
|
||||
|
||||
let _ = std::fs::remove_dir_all(&tmp);
|
||||
}
|
||||
|
|
@ -84,9 +84,12 @@ colibri_daemon_prestart()
|
|||
install -d -o "${colibri_daemon_user}" -g "${colibri_daemon_group}" -m 0750 \
|
||||
"$(/usr/bin/dirname "${colibri_daemon_logfile}")"
|
||||
|
||||
# Clear runtime files before starting so repeated service starts always
|
||||
# bind a fresh socket and write fresh pidfiles.
|
||||
rm -f "${colibri_daemon_socket}" "${pidfile}" "${supervisor_pidfile}"
|
||||
# Clear stale pidfiles before starting so a previous hard-killed run does
|
||||
# not trip rc.subr's "already running" check. The socket is left to the
|
||||
# daemon: it removes a stale socket on bind but refuses to start if a live
|
||||
# daemon is already listening, so an rc-side rm here could only clobber a
|
||||
# running instance.
|
||||
rm -f "${pidfile}" "${supervisor_pidfile}"
|
||||
|
||||
# Provider keys are optional. Keep them in a root-owned env file instead of
|
||||
# rc.conf so they are easy to rotate and not world-readable.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue