feat(daemon): headroom compression sidecar (hardened) #57
8 changed files with 328 additions and 5 deletions
|
|
@ -30,6 +30,8 @@ fn smoke_config() -> DaemonConfig {
|
|||
scheduler_prompt_injection: false,
|
||||
cache_warming_enabled: false,
|
||||
cache_warming_interval_hours: 0,
|
||||
headroom_enabled: false,
|
||||
headroom_socket_path: std::path::PathBuf::from("/var/run/colibri/headroom.sock"),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,12 @@ pub struct DaemonConfig {
|
|||
pub cache_warming_enabled: bool,
|
||||
/// Re-warm cache every N hours. 0 = only warm once on startup.
|
||||
pub cache_warming_interval_hours: u64,
|
||||
/// Enable headroom compression sidecar for tool results (disabled by default).
|
||||
/// When enabled, tool results are compressed before entering agent context,
|
||||
/// saving 40%+ tokens with no accuracy loss.
|
||||
pub headroom_enabled: bool,
|
||||
/// Path to the headroom sidecar Unix socket.
|
||||
pub headroom_socket_path: PathBuf,
|
||||
}
|
||||
|
||||
impl DaemonConfig {
|
||||
|
|
@ -99,6 +105,10 @@ impl DaemonConfig {
|
|||
cache_warming_enabled: env_parse("COLIBRI_CACHE_WARMING").unwrap_or(false),
|
||||
cache_warming_interval_hours: env_parse("COLIBRI_CACHE_WARMING_INTERVAL_HOURS")
|
||||
.unwrap_or(0),
|
||||
headroom_enabled: env_parse("COLIBRI_HEADROOM_ENABLED").unwrap_or(false),
|
||||
headroom_socket_path: std::env::var("COLIBRI_HEADROOM_SOCKET")
|
||||
.map(PathBuf::from)
|
||||
.unwrap_or_else(|_| PathBuf::from("/var/run/colibri/headroom.sock")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,7 +10,9 @@
|
|||
//! 3. Volatile scratch — discarded per-turn, never persisted
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
|
||||
use tokio::net::UnixStream;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cost mode
|
||||
|
|
@ -172,6 +174,119 @@ pub fn compact_tool_result(raw: &str, max_bytes: u64, tool_name: &str) -> Option
|
|||
))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Headroom compression sidecar
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// A long-lived connection to the headroom compression sidecar process.
|
||||
///
|
||||
/// The sidecar is a Python script (`scripts/headroom-sidecar.py`) that
|
||||
/// listens on a Unix domain socket. Each request is a single JSON line;
|
||||
/// each response is a single JSON line. The connection is kept open and
|
||||
/// reused across tool results.
|
||||
pub struct HeadroomSidecar {
|
||||
stream: UnixStream,
|
||||
socket_path: std::path::PathBuf,
|
||||
}
|
||||
|
||||
impl HeadroomSidecar {
|
||||
/// Connect to the headroom sidecar at the given socket path.
|
||||
pub async fn connect(socket_path: &std::path::Path) -> std::io::Result<Self> {
|
||||
let stream = UnixStream::connect(socket_path).await?;
|
||||
info!(
|
||||
socket = %socket_path.display(),
|
||||
"connected to headroom sidecar"
|
||||
);
|
||||
Ok(Self {
|
||||
stream,
|
||||
socket_path: socket_path.to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Compress a tool result through the sidecar.
|
||||
///
|
||||
/// Returns the compressed content on success, or `None` if compression
|
||||
/// failed or produced no savings (graceful degradation).
|
||||
pub async fn compress(&mut self, raw: &str, tool_name: &str) -> Option<String> {
|
||||
let request = serde_json::json!({
|
||||
"id": tool_name,
|
||||
"raw": raw,
|
||||
"role": "tool",
|
||||
});
|
||||
|
||||
let payload = serde_json::to_string(&request).unwrap_or_default() + "\n";
|
||||
|
||||
match self.stream.write_all(payload.as_bytes()).await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
warn!(error = %e, "headroom sidecar write failed");
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
let mut reader = BufReader::new(&mut self.stream);
|
||||
let mut line = String::new();
|
||||
match reader.read_line(&mut line).await {
|
||||
Ok(0) => {
|
||||
warn!("headroom sidecar closed connection");
|
||||
return None;
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
warn!(error = %e, "headroom sidecar read failed");
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
let response: serde_json::Value = match serde_json::from_str(&line) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
warn!(error = %e, raw_line = %line, "headroom sidecar returned invalid JSON");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
if response.get("error").is_some() {
|
||||
warn!(error = %response["error"], "headroom sidecar error");
|
||||
return None;
|
||||
}
|
||||
|
||||
let tokens_before = response["tokens_before"].as_u64().unwrap_or(0);
|
||||
let tokens_after = response["tokens_after"].as_u64().unwrap_or(0);
|
||||
let compressed = response["compressed"].as_str().unwrap_or(raw);
|
||||
|
||||
if tokens_after >= tokens_before {
|
||||
debug!(
|
||||
tool = %tool_name,
|
||||
tokens_before,
|
||||
tokens_after,
|
||||
"headroom: no savings, returning original"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
info!(
|
||||
tool = %tool_name,
|
||||
tokens_before,
|
||||
tokens_after,
|
||||
saved = tokens_before.saturating_sub(tokens_after),
|
||||
pct = 100 - (tokens_after * 100 / tokens_before.max(1)),
|
||||
"headroom compression applied"
|
||||
);
|
||||
|
||||
Some(compressed.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for HeadroomSidecar {
|
||||
fn drop(&mut self) {
|
||||
info!(
|
||||
socket = %self.socket_path.display(),
|
||||
"headroom sidecar disconnected"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ use tokio::sync::{broadcast, Mutex, RwLock};
|
|||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::config::DaemonConfig;
|
||||
use crate::cost::HeadroomSidecar;
|
||||
use crate::session::Session;
|
||||
use crate::spawner::{AgentHandle, Spawner};
|
||||
|
||||
|
|
@ -26,6 +27,8 @@ pub struct DaemonState {
|
|||
pub last_warm_hit_tokens: RwLock<u64>,
|
||||
pub shutdown_tx: broadcast::Sender<()>,
|
||||
pub shutdown_rx: broadcast::Receiver<()>,
|
||||
/// Headroom compression sidecar (optional, connected lazily on startup).
|
||||
pub headroom_sidecar: RwLock<Option<HeadroomSidecar>>,
|
||||
}
|
||||
|
||||
impl DaemonState {
|
||||
|
|
@ -49,6 +52,7 @@ impl DaemonState {
|
|||
last_warm_hit_tokens: RwLock::new(0),
|
||||
shutdown_tx,
|
||||
shutdown_rx,
|
||||
headroom_sidecar: RwLock::new(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -96,6 +100,23 @@ pub async fn run_loop(
|
|||
|
||||
info!("daemon background loop started");
|
||||
|
||||
// Connect headroom compression sidecar if enabled.
|
||||
if state.config.headroom_enabled {
|
||||
match HeadroomSidecar::connect(&state.config.headroom_socket_path).await {
|
||||
Ok(sidecar) => {
|
||||
info!("headroom compression sidecar active");
|
||||
*state.headroom_sidecar.write().await = Some(sidecar);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
error = %e,
|
||||
socket = %state.config.headroom_socket_path.display(),
|
||||
"headroom sidecar unavailable — continuing without compression"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = heartbeat_tick.tick() => {
|
||||
|
|
|
|||
|
|
@ -363,6 +363,8 @@ mod tests {
|
|||
scheduler_prompt_injection: true,
|
||||
cache_warming_enabled: false,
|
||||
cache_warming_interval_hours: 0,
|
||||
headroom_enabled: false,
|
||||
headroom_socket_path: std::path::PathBuf::from("/var/run/colibri/headroom.sock"),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ use serde_json::Value;
|
|||
use thiserror::Error;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::cost::HeadroomSidecar;
|
||||
use crate::DaemonConfig;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -489,7 +490,7 @@ impl Session {
|
|||
/// Build a PromptAssembly from the current session state.
|
||||
/// Wraps the existing `build_prompt_messages()` — no behavior change.
|
||||
pub async fn build_prompt_assembly(&self) -> PromptAssembly {
|
||||
let messages = self.build_prompt_messages().await;
|
||||
let messages = self.build_prompt_messages(None).await;
|
||||
let total_bytes: u64 = messages
|
||||
.iter()
|
||||
.map(|m| serde_json::to_string(m).unwrap_or_default().len() as u64)
|
||||
|
|
@ -518,7 +519,10 @@ impl Session {
|
|||
/// 1. Immutable system prefix (byte-stable for cache hits)
|
||||
/// 2. Appendable conversation log (turns, possibly with compaction gaps)
|
||||
/// 3. Volatile scratch (not included — caller appends per-request)
|
||||
pub async fn build_prompt_messages(&self) -> Vec<serde_json::Value> {
|
||||
pub async fn build_prompt_messages(
|
||||
&self,
|
||||
mut headroom_sidecar: Option<&mut HeadroomSidecar>,
|
||||
) -> Vec<serde_json::Value> {
|
||||
let mut messages: Vec<serde_json::Value> = Vec::new();
|
||||
|
||||
// Region 1: Immutable system prefix
|
||||
|
|
@ -555,11 +559,18 @@ impl Session {
|
|||
}));
|
||||
}
|
||||
SessionEntry::ToolResult { name, result } => {
|
||||
let raw = result.to_string();
|
||||
let content = match headroom_sidecar {
|
||||
Some(ref mut sidecar) => {
|
||||
sidecar.compress(&raw, name).await.unwrap_or(raw)
|
||||
}
|
||||
None => raw,
|
||||
};
|
||||
messages.push(serde_json::json!({
|
||||
"role": "tool",
|
||||
"tool_call_id": format!("call_{}", turn.index),
|
||||
"name": name,
|
||||
"content": result.to_string(),
|
||||
"content": content,
|
||||
}));
|
||||
}
|
||||
SessionEntry::Compaction { summary, .. } => {
|
||||
|
|
|
|||
|
|
@ -294,6 +294,11 @@ async fn cmd_status(state: &SharedState) -> ColibriResponse {
|
|||
"last_warm_cache_hit": last_warm_hit,
|
||||
"last_warm_hit_tokens": last_warm_tokens,
|
||||
},
|
||||
"headroom": {
|
||||
"enabled": state.config.headroom_enabled,
|
||||
"socket": state.config.headroom_socket_path,
|
||||
"connected": state.headroom_sidecar.read().await.is_some(),
|
||||
},
|
||||
}))
|
||||
}
|
||||
|
||||
|
|
@ -463,7 +468,13 @@ async fn cmd_get_session(state: &SharedState, session_id: String) -> ColibriResp
|
|||
match state.sessions.get(&session_id) {
|
||||
Some(session) => {
|
||||
let turns = session.value().turns().await;
|
||||
let messages = session.value().build_prompt_messages().await;
|
||||
let messages = {
|
||||
let mut sidecar_guard = state.headroom_sidecar.write().await;
|
||||
session
|
||||
.value()
|
||||
.build_prompt_messages((*sidecar_guard).as_mut())
|
||||
.await
|
||||
};
|
||||
|
||||
ColibriResponse::ok(serde_json::json!({
|
||||
"session_id": session_id,
|
||||
|
|
@ -701,6 +712,8 @@ mod tests {
|
|||
scheduler_prompt_injection: true,
|
||||
cache_warming_enabled: false,
|
||||
cache_warming_interval_hours: 0,
|
||||
headroom_enabled: false,
|
||||
headroom_socket_path: std::path::PathBuf::from("/var/run/colibri/headroom.sock"),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
149
scripts/headroom-sidecar.py
Normal file
149
scripts/headroom-sidecar.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Headroom compression sidecar for Colibri daemon.
|
||||
|
||||
Listens on a Unix domain socket. Accepts newline-delimited JSON requests,
|
||||
compresses the tool output via headroom, and returns the compressed result.
|
||||
|
||||
Protocol:
|
||||
→ {"id": "uuid", "raw": "tool output text...", "role": "tool"}\n
|
||||
← {"id": "uuid", "compressed": "...", "tokens_before": N, "tokens_after": M}\n
|
||||
|
||||
Exit codes:
|
||||
0 — clean shutdown (SIGTERM)
|
||||
1 — startup failure (socket bind error, import error)
|
||||
2 — runtime panic (unhandled exception in event loop)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import uuid
|
||||
|
||||
SOCKET_BACKLOG = 5
|
||||
MAX_MESSAGE_BYTES = 1_048_576 # 1 MiB
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Headroom compression sidecar")
|
||||
parser.add_argument(
|
||||
"--socket",
|
||||
default="/var/run/colibri/headroom.sock",
|
||||
help="Unix socket path",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
sock_path = args.socket
|
||||
_ensure_socket_dir(sock_path)
|
||||
|
||||
# Late import so --help is fast
|
||||
from headroom import compress
|
||||
|
||||
running = True
|
||||
|
||||
def _handle_signal(signum, frame):
|
||||
nonlocal running
|
||||
running = False
|
||||
|
||||
signal.signal(signal.SIGTERM, _handle_signal)
|
||||
signal.signal(signal.SIGINT, _handle_signal)
|
||||
|
||||
_remove_stale_socket(sock_path)
|
||||
|
||||
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
server.bind(sock_path)
|
||||
os.chmod(sock_path, 0o660)
|
||||
server.listen(SOCKET_BACKLOG)
|
||||
server.settimeout(1.0) # wake up every second to check shutdown flag
|
||||
|
||||
print(f"headroom-sidecar listening on {sock_path}", flush=True)
|
||||
|
||||
while running:
|
||||
try:
|
||||
conn, _addr = server.accept()
|
||||
except socket.timeout:
|
||||
continue
|
||||
|
||||
try:
|
||||
_handle_connection(conn, compress)
|
||||
except Exception:
|
||||
# Don't crash the whole server on one bad message
|
||||
pass
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
_remove_stale_socket(sock_path)
|
||||
return 0
|
||||
|
||||
|
||||
def _ensure_socket_dir(sock_path: str) -> None:
|
||||
sock_dir = os.path.dirname(sock_path)
|
||||
os.makedirs(sock_dir, exist_ok=True)
|
||||
|
||||
|
||||
def _remove_stale_socket(sock_path: str) -> None:
|
||||
try:
|
||||
os.unlink(sock_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _handle_connection(conn: socket.socket, compress_fn) -> None:
|
||||
"""Read one line, compress, write one line, close."""
|
||||
buf = b""
|
||||
while True:
|
||||
try:
|
||||
chunk = conn.recv(4096)
|
||||
except (ConnectionResetError, BrokenPipeError):
|
||||
return
|
||||
if not chunk:
|
||||
break
|
||||
buf += chunk
|
||||
if b"\n" in buf or len(buf) > MAX_MESSAGE_BYTES:
|
||||
break
|
||||
|
||||
if not buf:
|
||||
return
|
||||
|
||||
try:
|
||||
request = json.loads(buf.split(b"\n")[0])
|
||||
except json.JSONDecodeError:
|
||||
_write_error(conn, "invalid json")
|
||||
return
|
||||
|
||||
req_id = request.get("id", str(uuid.uuid4()))
|
||||
raw = request.get("raw", "")
|
||||
role = request.get("role", "tool")
|
||||
|
||||
try:
|
||||
messages = [{"role": role, "content": raw}]
|
||||
result = compress_fn(messages)
|
||||
response = {
|
||||
"id": req_id,
|
||||
"compressed": result.messages[0]["content"],
|
||||
"tokens_before": result.tokens_before,
|
||||
"tokens_after": result.tokens_after,
|
||||
}
|
||||
except Exception as exc:
|
||||
_write_error(conn, str(exc))
|
||||
return
|
||||
|
||||
try:
|
||||
conn.sendall(json.dumps(response).encode() + b"\n")
|
||||
except (BrokenPipeError, ConnectionResetError):
|
||||
pass
|
||||
|
||||
|
||||
def _write_error(conn: socket.socket, message: str) -> None:
|
||||
try:
|
||||
conn.sendall(
|
||||
json.dumps({"error": message}).encode() + b"\n"
|
||||
)
|
||||
except (BrokenPipeError, ConnectionResetError):
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Loading…
Add table
Reference in a new issue