From df5fbab051b9f441efe1846bff76423d3c00c9db Mon Sep 17 00:00:00 2001 From: Sam & Claude Date: Mon, 15 Jun 2026 08:28:20 +0200 Subject: [PATCH 1/3] =?UTF-8?q?fix(rc):=20FreeBSD=20rc.d=20deep-audit=20?= =?UTF-8?q?=E2=80=94=20cost=20mode=20naming,=20chmod=20cleanup,=20health?= =?UTF-8?q?=20check,=20docs=20(Sam=20&=20Hermes)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six bugs found in deep-dive analysis of FreeBSD rc.d/rc.conf after the live-copy-safe fix (7d23905): 1. colibri_cost_mode → colibri_daemon_cost_mode: naming broke rc.subr ${name}_ convention — operator setting colibri_daemon_cost_mode=fast in rc.conf was silently ignored. Fixed in rc.d, staging script, rc.conf.sample, and all docs. 2. Removed redundant chmod 660 on socket in poststart: Rust code already sets 0770 with documented rationale. The poststart override to 0660 was conflicting, fragile, and had no comment. 3. Removed unnecessary chmod 644 on pidfile in poststart: pidfile lives in a 0750 directory — world-readable permission is pointless and security-negative. 4. Fixed ISO-SERVICE-LAYOUT.md: socket perms were wrong (said 750, actual 770), colibri-daemon.pid was labeled supervisor pidfile (it's the child), supervisor pidfile was missing entirely, shutdown behavior didn't mention custom stop_cmd targeting the supervisor. 5. health_cmd now checks for non-empty daemon response instead of just connectvity — a hung daemon accepting connections but returning garbage was reported healthy. 6. rc.conf.sample hostname path: $ (hostname) → $(/bin/hostname) for consistency with rc.d script and early-boot PATH safety. Checks: sh -n OK, cargo fmt --check OK, cargo clippy clean, cargo test --workspace 207 passed. --- docs/ISO-ACCEPTANCE-RUNBOOK.md | 2 +- docs/ISO-SERVICE-LAYOUT.md | 12 +++++++----- packaging/freebsd/colibri_daemon.in | 26 +++++++++++++------------- scripts/stage-colibri-iso.sh | 4 ++-- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/docs/ISO-ACCEPTANCE-RUNBOOK.md b/docs/ISO-ACCEPTANCE-RUNBOOK.md index 6966b8f..1b4546a 100644 --- a/docs/ISO-ACCEPTANCE-RUNBOOK.md +++ b/docs/ISO-ACCEPTANCE-RUNBOOK.md @@ -65,7 +65,7 @@ tasks scheduler.interval_secs ``` -Confirm `cost.mode` matches `colibri_cost_mode` from rc.conf. +Confirm `cost.mode` matches `colibri_daemon_cost_mode` from rc.conf. ## 3. Task board check diff --git a/docs/ISO-SERVICE-LAYOUT.md b/docs/ISO-SERVICE-LAYOUT.md index 3268b3c..9be6852 100644 --- a/docs/ISO-SERVICE-LAYOUT.md +++ b/docs/ISO-SERVICE-LAYOUT.md @@ -19,6 +19,7 @@ /usr/local/etc/rc.d/colibri_daemon ← rc.d service script /usr/local/etc/colibri/ rc.conf.sample ← service config template + provider.env ← optional API keys (mode 0600, root-owned) /usr/local/etc/newsyslog.conf.d/ colibri.conf ← log rotation (1MB, 7 archives) @@ -27,8 +28,9 @@ sessions/ ← JSONL session files /var/run/colibri/ ← tmpfs (recreated each boot) - colibri.sock ← Unix socket (750 colibri:colibri) - colibri-daemon.pid ← daemon(8) supervisor pidfile + colibri.sock ← Unix socket (770 colibri:colibri, set by daemon) + colibri-daemon.pid ← child pidfile (daemon(8) -p) + colibri-daemon-supervisor.pid ← supervisor pidfile (daemon(8) -P) /var/log/colibri/ ← persistent logs daemon.log ← stdout/stderr from colibri-daemon @@ -45,8 +47,8 @@ ## Shutdown behavior -1. `rc.d` sends SIGTERM to daemon(8) -2. daemon(8) forwards to colibri-daemon child +1. `rc.d` sends SIGTERM to daemon(8) supervisor (via custom `stop_cmd`) +2. Supervisor forwards signal to colibri-daemon child and exits (no restart) 3. Colibri closes socket, flushes SQLite, stops scheduler 4. `poststop`: removes stale socket from tmpfs @@ -72,7 +74,7 @@ colibri list-tasks --status queued | `colibri_daemon_enable` | NO | Enable at boot (YES/NO) | | `colibri_daemon_user` | colibri | Runtime user | | `colibri_daemon_group` | colibri | Runtime group | -| `colibri_cost_mode` | smart | fast/smart/max | +| `colibri_daemon_cost_mode`| smart | fast/smart/max | | `colibri_daemon_data_dir` | /var/db/colibri | Persistent data | | `colibri_daemon_run_dir` | /var/run/colibri | tmpfs runtime | | `colibri_daemon_socket` | $run_dir/colibri.sock | Unix socket path | diff --git a/packaging/freebsd/colibri_daemon.in b/packaging/freebsd/colibri_daemon.in index 2f3ccf6..4970400 100644 --- a/packaging/freebsd/colibri_daemon.in +++ b/packaging/freebsd/colibri_daemon.in @@ -46,7 +46,7 @@ load_rc_config $name : ${colibri_daemon_logfile:="/var/log/colibri/daemon.log"} : ${colibri_daemon_provider_env:="/usr/local/etc/colibri/provider.env"} : ${colibri_daemon_host:="$(/bin/hostname)"} -: ${colibri_cost_mode:="smart"} +: ${colibri_daemon_cost_mode:="smart"} pidfile="${colibri_daemon_run_dir}/colibri-daemon.pid" # Supervisor pidfile (the daemon(8) parent). Kept distinct from the child @@ -101,7 +101,7 @@ colibri_daemon_prestart() export COLIBRI_DAEMON_SOCKET="${colibri_daemon_socket}" export COLIBRI_DB_PATH="${colibri_daemon_db_path}" export COLIBRI_HOST="${colibri_daemon_host}" - export COLIBRI_COST_MODE="${colibri_cost_mode}" + export COLIBRI_COST_MODE="${colibri_daemon_cost_mode}" export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin" } @@ -117,8 +117,6 @@ colibri_daemon_poststart() if [ -S "${colibri_daemon_socket}" ]; then echo "colibri-daemon socket ready after ${waited}s" - chmod 644 "${pidfile}" 2>/dev/null || true - chmod 660 "${colibri_daemon_socket}" 2>/dev/null || true else echo "WARNING: colibri-daemon socket not ready after ${timeout}s" fi @@ -169,18 +167,20 @@ colibri_daemon_poststop() health_cmd="colibri_daemon_health" colibri_daemon_health() { - if [ -S "${colibri_daemon_socket}" ]; then - if printf '{"cmd":"status"}\n' | nc -U "${colibri_daemon_socket}" -w 2 >/dev/null 2>&1; then - echo "colibri-daemon is healthy (socket responding)" - return 0 - else - echo "colibri-daemon socket exists but not responding" - return 1 - fi - else + if [ ! -S "${colibri_daemon_socket}" ]; then echo "colibri-daemon socket not found" return 1 fi + + local _resp + _resp=$(printf '{"cmd":"status"}\n' | nc -U "${colibri_daemon_socket}" -w 2 2>/dev/null) + if [ -n "${_resp}" ]; then + echo "colibri-daemon is healthy (socket responding)" + return 0 + else + echo "colibri-daemon socket exists but no response" + return 1 + fi } run_rc_command "$1" diff --git a/scripts/stage-colibri-iso.sh b/scripts/stage-colibri-iso.sh index c38c0f0..11978b2 100755 --- a/scripts/stage-colibri-iso.sh +++ b/scripts/stage-colibri-iso.sh @@ -68,8 +68,8 @@ colibri_daemon_run_dir="/var/run/colibri" colibri_daemon_socket="/var/run/colibri/colibri.sock" colibri_daemon_db_path="/var/db/colibri/colibri.sqlite" colibri_daemon_logfile="/var/log/colibri/daemon.log" -colibri_daemon_host="\$(hostname)" -colibri_cost_mode="smart" +colibri_daemon_host="\$(/bin/hostname)" +colibri_daemon_cost_mode="smart" EOF cat > "$ETC_DIR/README.iso" <<'EOF' -- 2.45.3 From b32c3acaed2f6563d259fb169dabb51897936256 Mon Sep 17 00:00:00 2001 From: Sam & Claude Date: Mon, 15 Jun 2026 08:37:06 +0200 Subject: [PATCH 2/3] fix(daemon): handle SIGTERM + liveness-aware socket cleanup (Sam & Claude) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rc.d "rm stale socket on prestart" fix (07e4660) was a band-aid over two daemon-side defects that surfaced on the live FreeBSD host: 1. colibri-daemon never handled SIGTERM. main.rs awaited only ctrl_c() (SIGINT), so `service stop`/`restart` — which sends SIGTERM via daemon(8) to the child — killed it on the default disposition with no cleanup. The graceful path (socket removal, agent reaping) never ran, leaking the socket file and orphaning spawned agents across restarts. Now wait_for_shutdown_signal() selects on SIGTERM or SIGINT, so the same graceful path runs on a normal service stop. New integration test (tests/sigterm_shutdown.rs) spawns the binary, sends SIGTERM, and asserts the socket is removed. 2. Stale-socket cleanup had no liveness check — both the daemon (socket.rs) and the rc prestart would unconditionally rm the socket before bind, which could delete a *running* instance's socket if rc.subr's pid detection misfires and starts a second daemon. Cleanup now probes first (clear_stale_socket): connect succeeds -> refuse to start; refused/dead -> remove and bind. Unit-tested for absent, stale, and live cases. With the daemon owning safe socket cleanup, the rc prestart no longer removes the socket (only stale pidfiles), eliminating the restart-time clobber hazard. This also makes the SIGTERM shutdown described in ISO-SERVICE-LAYOUT.md (PR #75) actually true. Gates: cargo fmt --check, clippy -D warnings, cargo test --workspace all green on Linux; sh -n on the rc script OK. FreeBSD runtime validation still pending per FREEBSD-BUILD-LANE-HANDOFF.md. Co-Authored-By: Claude Opus 4.8 --- crates/colibri-daemon/src/main.rs | 44 +++++++- crates/colibri-daemon/src/socket.rs | 102 ++++++++++++++++-- .../colibri-daemon/tests/sigterm_shutdown.rs | 74 +++++++++++++ packaging/freebsd/colibri_daemon.in | 9 +- 4 files changed, 215 insertions(+), 14 deletions(-) create mode 100644 crates/colibri-daemon/tests/sigterm_shutdown.rs diff --git a/crates/colibri-daemon/src/main.rs b/crates/colibri-daemon/src/main.rs index ee465fa..96fe40b 100644 --- a/crates/colibri-daemon/src/main.rs +++ b/crates/colibri-daemon/src/main.rs @@ -86,8 +86,8 @@ async fn main() -> Result<(), Box> { // Listen for shutdown signals let shutdown_state = state.clone(); tokio::spawn(async move { - tokio::signal::ctrl_c().await.ok(); - info!("received interrupt signal, initiating graceful shutdown"); + wait_for_shutdown_signal().await; + info!("received shutdown signal, initiating graceful shutdown"); let _ = shutdown_state.shutdown_tx.send(()); // Give sub-tasks a moment to clean up @@ -109,3 +109,43 @@ async fn main() -> Result<(), Box> { info!("colibri-daemon shut down cleanly"); Ok(()) } + +/// Wait for a process shutdown signal. +/// +/// On Unix this resolves on **either** SIGTERM (what `service stop`/`restart` +/// sends — daemon(8) forwards it to this child) or SIGINT (Ctrl+C). Handling +/// SIGTERM is what lets the graceful path run on a normal `service stop`: the +/// socket is removed (see `socket::serve`) and spawned agents are reaped. Awaiting +/// only `ctrl_c()` (SIGINT) would leave SIGTERM on its default disposition — +/// immediate kill with no cleanup, leaking the socket file and child agents. +#[cfg(unix)] +async fn wait_for_shutdown_signal() { + use tokio::signal::unix::{signal, SignalKind}; + + let mut sigterm = match signal(SignalKind::terminate()) { + Ok(s) => s, + Err(e) => { + tracing::error!(error = %e, "failed to install SIGTERM handler; falling back to SIGINT only"); + tokio::signal::ctrl_c().await.ok(); + return; + } + }; + let mut sigint = match signal(SignalKind::interrupt()) { + Ok(s) => s, + Err(e) => { + tracing::error!(error = %e, "failed to install SIGINT handler; waiting on SIGTERM only"); + sigterm.recv().await; + return; + } + }; + + tokio::select! { + _ = sigterm.recv() => info!("received SIGTERM"), + _ = sigint.recv() => info!("received SIGINT"), + } +} + +#[cfg(not(unix))] +async fn wait_for_shutdown_signal() { + tokio::signal::ctrl_c().await.ok(); +} diff --git a/crates/colibri-daemon/src/socket.rs b/crates/colibri-daemon/src/socket.rs index 150251b..aaee16e 100644 --- a/crates/colibri-daemon/src/socket.rs +++ b/crates/colibri-daemon/src/socket.rs @@ -26,20 +26,52 @@ use crate::{ColibriCommand, ColibriResponse, SharedState}; // Socket server // --------------------------------------------------------------------------- +/// Prepare `socket_path` for `bind()`: remove a stale socket file, but never one +/// a live daemon is still serving. +/// +/// A bare `remove_file` would happily delete the socket of a running instance — +/// e.g. if rc.subr's pid detection misfires and a second daemon starts — which +/// orphans the first and races two listeners on the same path. So probe first: +/// +/// - file absent → nothing to do, `true` (safe to bind) +/// - connect succeeds → another daemon owns it → `false` (caller must NOT bind) +/// - connect refused / not a socket → stale leftover → remove it, `true` +/// +/// Returns `true` when the caller may proceed to `bind()`. +async fn clear_stale_socket(socket_path: &std::path::Path) -> bool { + if !socket_path.exists() { + return true; + } + match UnixStream::connect(socket_path).await { + Ok(_) => { + error!( + path = %socket_path.display(), + "another colibri-daemon is already listening on this socket; refusing to start" + ); + false + } + Err(_) => { + // Stale socket (connection refused, or not a socket): safe to remove. + if let Err(e) = tokio::fs::remove_file(socket_path).await { + warn!( + path = %socket_path.display(), + error = %e, + "failed to remove stale socket file" + ); + } + true + } + } +} + /// Run the Colibri control-plane socket server. Binds to the configured Unix socket path /// and processes inbound commands until a shutdown signal is received. pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) { let socket_path = state.config.socket_path.clone(); - // Clean up stale socket file - if socket_path.exists() { - if let Err(e) = tokio::fs::remove_file(&socket_path).await { - warn!( - path = %socket_path.display(), - error = %e, - "failed to remove stale socket file" - ); - } + // Clear a stale socket file, refusing to start if a live daemon owns it. + if !clear_stale_socket(&socket_path).await { + return; } // Ensure parent directory exists @@ -785,4 +817,56 @@ mod tests { assert_eq!(pane.state, colibri_glasspane::AgentState::Done); assert_eq!(pane.pi_session_id.as_deref(), Some("pi-fake")); } + + #[tokio::test] + async fn clear_stale_socket_returns_true_when_absent() { + let dir = std::env::temp_dir().join(format!( + "colibri-daemon-socket-test-{}", + uuid::Uuid::new_v4() + )); + std::fs::create_dir_all(&dir).unwrap(); + let path = dir.join("colibri.sock"); + assert!(!path.exists()); + assert!(clear_stale_socket(&path).await, "absent socket is bindable"); + let _ = std::fs::remove_dir_all(&dir); + } + + #[tokio::test] + async fn clear_stale_socket_removes_dead_socket_file() { + let dir = std::env::temp_dir().join(format!( + "colibri-daemon-socket-test-{}", + uuid::Uuid::new_v4() + )); + std::fs::create_dir_all(&dir).unwrap(); + let path = dir.join("colibri.sock"); + // A bound-then-dropped listener leaves the socket file with no listener + // behind it — exactly the "stale leftover after a hard kill" case. + let listener = UnixListener::bind(&path).unwrap(); + drop(listener); + assert!(path.exists(), "dropped listener leaves the socket file"); + assert!( + clear_stale_socket(&path).await, + "stale socket should be cleared and bindable" + ); + assert!(!path.exists(), "stale socket file should have been removed"); + let _ = std::fs::remove_dir_all(&dir); + } + + #[tokio::test] + async fn clear_stale_socket_refuses_when_live_listener_present() { + let dir = std::env::temp_dir().join(format!( + "colibri-daemon-socket-test-{}", + uuid::Uuid::new_v4() + )); + std::fs::create_dir_all(&dir).unwrap(); + let path = dir.join("colibri.sock"); + // Hold a live listener: a connect will succeed, so cleanup must refuse. + let _listener = UnixListener::bind(&path).unwrap(); + assert!( + !clear_stale_socket(&path).await, + "must refuse to clear a socket a live daemon is serving" + ); + assert!(path.exists(), "live socket must not be removed"); + let _ = std::fs::remove_dir_all(&dir); + } } diff --git a/crates/colibri-daemon/tests/sigterm_shutdown.rs b/crates/colibri-daemon/tests/sigterm_shutdown.rs new file mode 100644 index 0000000..34f9ac3 --- /dev/null +++ b/crates/colibri-daemon/tests/sigterm_shutdown.rs @@ -0,0 +1,74 @@ +//! Graceful-shutdown-on-SIGTERM proof — integration test. +//! +//! `service stop`/`restart` on FreeBSD sends SIGTERM (daemon(8) forwards it to +//! the colibri-daemon child). The daemon must treat SIGTERM like SIGINT: run the +//! graceful path and remove its Unix socket on the way out. If SIGTERM were left +//! on its default disposition, the child would die without cleanup and leak the +//! socket file — which is what forced the rc.d script to `rm` it. This test +//! spawns the real binary, waits for the socket, sends SIGTERM, and asserts the +//! socket is gone. +#![cfg(unix)] + +use std::process::Command as StdCommand; +use std::time::{Duration, Instant}; + +#[tokio::test] +async fn sigterm_triggers_graceful_socket_cleanup() { + let tmp = std::env::temp_dir().join(format!("colibri-sigterm-test-{}", std::process::id())); + std::fs::create_dir_all(&tmp).expect("create temp data dir"); + let socket_path = tmp.join("colibri.sock"); + + let bin = env!("CARGO_BIN_EXE_colibri-daemon"); + let mut child = StdCommand::new(bin) + .env("COLIBRI_DAEMON_DATA_DIR", &tmp) + .env("COLIBRI_DAEMON_SOCKET", &socket_path) + .env("COLIBRI_DB_PATH", tmp.join("colibri.sqlite")) + .env("COLIBRI_HOST", "sigterm-test") + // Keep logs quiet; errors still surface on failure via the assertions. + .env("RUST_LOG", "warn") + .spawn() + .expect("failed to spawn colibri-daemon"); + + // Wait for the daemon to bind its socket (it forks no children of its own + // here; the file appears once `socket::serve` calls bind()). + let deadline = Instant::now() + Duration::from_secs(15); + while !socket_path.exists() { + if Instant::now() > deadline { + let _ = child.kill(); + panic!("daemon did not create socket within timeout"); + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + + // Send SIGTERM (the signal `service stop` delivers), then wait for exit. + let pid = child.id(); + let status = StdCommand::new("kill") + .arg("-TERM") + .arg(pid.to_string()) + .status() + .expect("failed to run kill"); + assert!(status.success(), "kill -TERM failed"); + + // Poll for process exit so we don't block forever if shutdown hangs. + let deadline = Instant::now() + Duration::from_secs(15); + loop { + match child.try_wait().expect("try_wait failed") { + Some(_) => break, + None => { + if Instant::now() > deadline { + let _ = child.kill(); + panic!("daemon did not exit within timeout after SIGTERM"); + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + + assert!( + !socket_path.exists(), + "socket file should be removed by the graceful SIGTERM path, but it still exists at {}", + socket_path.display() + ); + + let _ = std::fs::remove_dir_all(&tmp); +} diff --git a/packaging/freebsd/colibri_daemon.in b/packaging/freebsd/colibri_daemon.in index 4970400..60ec12a 100644 --- a/packaging/freebsd/colibri_daemon.in +++ b/packaging/freebsd/colibri_daemon.in @@ -84,9 +84,12 @@ colibri_daemon_prestart() install -d -o "${colibri_daemon_user}" -g "${colibri_daemon_group}" -m 0750 \ "$(/usr/bin/dirname "${colibri_daemon_logfile}")" - # Clear runtime files before starting so repeated service starts always - # bind a fresh socket and write fresh pidfiles. - rm -f "${colibri_daemon_socket}" "${pidfile}" "${supervisor_pidfile}" + # Clear stale pidfiles before starting so a previous hard-killed run does + # not trip rc.subr's "already running" check. The socket is left to the + # daemon: it removes a stale socket on bind but refuses to start if a live + # daemon is already listening, so an rc-side rm here could only clobber a + # running instance. + rm -f "${pidfile}" "${supervisor_pidfile}" # Provider keys are optional. Keep them in a root-owned env file instead of # rc.conf so they are easy to rotate and not world-readable. -- 2.45.3 From 4517e13935919370fc35792e456fd661c04dc9c5 Mon Sep 17 00:00:00 2001 From: Sam & Claude Date: Mon, 15 Jun 2026 09:08:56 +0200 Subject: [PATCH 3/3] fix(daemon): fail closed when socket ownership is unsafe (Sam & Codex) Return an error from the socket server when another daemon owns the Unix socket or bind setup fails, and broadcast shutdown so the daemon does not stay alive without a control socket. Also format the PR docs.\n\nChecks: cargo fmt --check; ./scripts/check-format.sh; git diff --check; cargo test -p colibri-daemon clear_stale_socket -- --nocapture; cargo test -p colibri-daemon --test sigterm_shutdown -- --nocapture. --- crates/colibri-daemon/src/main.rs | 7 +++--- crates/colibri-daemon/src/socket.rs | 20 ++++++++++++----- .../tests/intake_scheduler_loop.rs | 2 +- docs/ISO-SERVICE-LAYOUT.md | 22 +++++++++---------- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/crates/colibri-daemon/src/main.rs b/crates/colibri-daemon/src/main.rs index 96fe40b..729d76d 100644 --- a/crates/colibri-daemon/src/main.rs +++ b/crates/colibri-daemon/src/main.rs @@ -71,9 +71,8 @@ async fn main() -> Result<(), Box> { // Start the Colibri control-plane socket server let socket_state = state.clone(); let socket_shutdown = state.shutdown_rx.resubscribe(); - let socket_handle = tokio::spawn(async move { - socket::serve(socket_state, socket_shutdown).await; - }); + let socket_handle = + tokio::spawn(async move { socket::serve(socket_state, socket_shutdown).await }); // Start the daemon background loop (heartbeat, session rotation, scheduler) let loop_state = state.clone(); @@ -103,7 +102,7 @@ async fn main() -> Result<(), Box> { // Wait for both the socket server and the daemon loop to finish let (socket_result, loop_result) = tokio::join!(socket_handle, loop_handle); - socket_result?; + socket_result??; loop_result?; info!("colibri-daemon shut down cleanly"); diff --git a/crates/colibri-daemon/src/socket.rs b/crates/colibri-daemon/src/socket.rs index aaee16e..923de49 100644 --- a/crates/colibri-daemon/src/socket.rs +++ b/crates/colibri-daemon/src/socket.rs @@ -7,7 +7,7 @@ //! - Client sends: `{"cmd":"status"}\n` //! - Server responds: `{"ok":true,"data":{...}}\n` -use std::time::SystemTime; +use std::{io, time::SystemTime}; use std::collections::HashMap; @@ -66,12 +66,19 @@ async fn clear_stale_socket(socket_path: &std::path::Path) -> bool { /// Run the Colibri control-plane socket server. Binds to the configured Unix socket path /// and processes inbound commands until a shutdown signal is received. -pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) { +pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) -> io::Result<()> { let socket_path = state.config.socket_path.clone(); // Clear a stale socket file, refusing to start if a live daemon owns it. if !clear_stale_socket(&socket_path).await { - return; + let _ = state.shutdown_tx.send(()); + return Err(io::Error::new( + io::ErrorKind::AddrInUse, + format!( + "another colibri-daemon is already listening on {}", + socket_path.display() + ), + )); } // Ensure parent directory exists @@ -82,7 +89,8 @@ pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) error = %e, "failed to create socket parent directory" ); - return; + let _ = state.shutdown_tx.send(()); + return Err(e); } } @@ -94,7 +102,8 @@ pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) error = %e, "failed to bind Unix socket" ); - return; + let _ = state.shutdown_tx.send(()); + return Err(e); } }; @@ -143,6 +152,7 @@ pub async fn serve(state: SharedState, mut shutdown_rx: broadcast::Receiver<()>) // Clean up socket file on graceful exit let _ = tokio::fs::remove_file(&socket_path).await; info!("Colibri control-plane socket shut down"); + Ok(()) } // --------------------------------------------------------------------------- diff --git a/crates/colibri-daemon/tests/intake_scheduler_loop.rs b/crates/colibri-daemon/tests/intake_scheduler_loop.rs index a4514b8..e67e0a9 100644 --- a/crates/colibri-daemon/tests/intake_scheduler_loop.rs +++ b/crates/colibri-daemon/tests/intake_scheduler_loop.rs @@ -35,7 +35,7 @@ async fn intake_task_over_socket_drains_to_sqlite_via_run_loop() { let server_state = state.clone(); let server_shutdown = state.shutdown_rx.resubscribe(); let server = tokio::spawn(async move { - socket::serve(server_state, server_shutdown).await; + let _ = socket::serve(server_state, server_shutdown).await; }); // Background loop with a fast scheduler tick so the test stays quick. diff --git a/docs/ISO-SERVICE-LAYOUT.md b/docs/ISO-SERVICE-LAYOUT.md index 9be6852..a6b2e62 100644 --- a/docs/ISO-SERVICE-LAYOUT.md +++ b/docs/ISO-SERVICE-LAYOUT.md @@ -69,17 +69,17 @@ colibri list-tasks --status queued ## Config knobs (set in /etc/rc.conf or /etc/rc.conf.d/colibri_daemon) -| Variable | Default | Purpose | -| ------------------------- | --------------------------- | ----------------------- | -| `colibri_daemon_enable` | NO | Enable at boot (YES/NO) | -| `colibri_daemon_user` | colibri | Runtime user | -| `colibri_daemon_group` | colibri | Runtime group | -| `colibri_daemon_cost_mode`| smart | fast/smart/max | -| `colibri_daemon_data_dir` | /var/db/colibri | Persistent data | -| `colibri_daemon_run_dir` | /var/run/colibri | tmpfs runtime | -| `colibri_daemon_socket` | $run_dir/colibri.sock | Unix socket path | -| `colibri_daemon_db_path` | $data_dir/colibri.sqlite | SQLite path | -| `colibri_daemon_logfile` | /var/log/colibri/daemon.log | Log output | +| Variable | Default | Purpose | +| -------------------------- | --------------------------- | ----------------------- | +| `colibri_daemon_enable` | NO | Enable at boot (YES/NO) | +| `colibri_daemon_user` | colibri | Runtime user | +| `colibri_daemon_group` | colibri | Runtime group | +| `colibri_daemon_cost_mode` | smart | fast/smart/max | +| `colibri_daemon_data_dir` | /var/db/colibri | Persistent data | +| `colibri_daemon_run_dir` | /var/run/colibri | tmpfs runtime | +| `colibri_daemon_socket` | $run_dir/colibri.sock | Unix socket path | +| `colibri_daemon_db_path` | $data_dir/colibri.sqlite | SQLite path | +| `colibri_daemon_logfile` | /var/log/colibri/daemon.log | Log output | ## Log rotation -- 2.45.3