From 39db0b661b14237a5b2b5fa45beb8ebe7feb9dad Mon Sep 17 00:00:00 2001 From: Sam & Hermes Date: Sun, 31 May 2026 15:16:02 +0200 Subject: [PATCH 1/2] docs: T1.4 cache-first prompt discipline plan Inventory of current prompt/cache/session code across Colibri: - cost.rs: CostMode (fast/smart/max), thresholds, escalation (present) - session.rs: 3-region prompt assembly, compaction (present) - colibri-deepseek: STABLE_SYSTEM_PREFIX, cache probe (present) - config.rs: DaemonConfig with cost fields (present) Plan adds 6 integration items across 3 PRs: 1. PromptAssembly + CacheMetrics structs 2. Cost-aware trimming + auto-escalation 3. Scheduler injection + cache warming No changes to parked branches (colibri-skills, zot harness). Build: N/A (docs-only) | Tests: N/A --- docs/T1.4-PROMPT-DISCIPLINE-PLAN.md | 241 ++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 docs/T1.4-PROMPT-DISCIPLINE-PLAN.md diff --git a/docs/T1.4-PROMPT-DISCIPLINE-PLAN.md b/docs/T1.4-PROMPT-DISCIPLINE-PLAN.md new file mode 100644 index 0000000..84b519a --- /dev/null +++ b/docs/T1.4-PROMPT-DISCIPLINE-PLAN.md @@ -0,0 +1,241 @@ +# T1.4 — Cache-First Prompt Discipline Plan + +## Status: code present, integration incomplete + +The building blocks exist across 4 crates/files. T1.4 connects them +into a deterministic, measurable, cost-aware pipeline. 
+ +## Files touched + +| File | Current state | T1.4 change | +|------|--------------|-------------| +| `colibri-deepseek/src/lib.rs` | Cache probe + `STABLE_SYSTEM_PREFIX` | Add `CacheMetrics` struct, per-session prefix assembly | +| `colibri-daemon/src/cost.rs` | `CostMode`, thresholds, escalation | Add `prompt_budget()` per mode, auto-escalation trigger | +| `colibri-daemon/src/session.rs` | 3-region model, `build_prompt_messages()` | Deterministic assembly, cache-hit tracking, cost-aware trim | +| `colibri-daemon/src/config.rs` | `DaemonConfig` with cost fields | Add `cache_warm_on_start`, `cost_escalation_auto` | +| `colibri-daemon/src/socket.rs` | `set-cost-mode`, status reporting | Add cache-metrics to status, per-request budget header | +| `colibri-daemon/src/spawner.rs` | Agent spawn with session_id | Inject cost-aware prompt assembly before spawn | + +## Current architecture (what's already built) + +``` +cost.rs: CostMode { Fast, Smart, Max } + ├─ session_max_bytes() 500K / 2M / 8M + ├─ max_uncompacted_turns() 5 / 20 / 100 + ├─ compact_tool_results() true / true / false + ├─ tool_result_max_bytes() 4K / 16K / unlimited + └─ escalate() Fast→Smart→Max + +session.rs: Session + ├─ turns: VecDeque (append-only JSONL) + ├─ build_prompt_messages() → Vec + │ ├─ Region 1: STABLE_SYSTEM_PREFIX (byte-stable) + │ ├─ Region 2: conversation log (turns) + │ └─ Region 3: volatile scratch (left empty) + ├─ compact_oldest_turns() — summarise + compaction marker + └─ maybe_compact_or_rollover() — auto-triggered on append + +colibri-deepseek: cache probe + ├─ STABLE_SYSTEM_PREFIX — deliberately fixed byte-for-byte + ├─ run_cache_probe() — warm + probe + cache_hit_observed + └─ WireUsage — prompt_cache_hit_tokens, prompt_cache_miss_tokens +``` + +## What T1.4 adds + +### 1. Deterministic 3-region assembler + +`build_prompt_messages()` currently returns `Vec`. 
T1.4 wraps this +in a `PromptAssembly` struct with explicit byte-stability guarantees: + +```rust +pub struct PromptAssembly { + pub immutable_prefix: String, // Region 1 — byte-identical every request + pub appendable_log: Vec, // Region 2 — grows monotonically + pub volatile_scratch: Vec, // Region 3 — discarded per-turn + pub total_bytes: u64, + pub estimated_tokens: u64, + pub cost_mode: CostMode, +} + +impl PromptAssembly { + /// Build the full message list for an API call. + /// The caller appends volatile scratch before sending. + pub fn to_messages(&self) -> Vec; + + /// Check whether the prefix can still cache-hit given current byte count. + /// If the prefix has grown (e.g. new session metadata), cache may miss. + pub fn prefix_is_cacheable(&self) -> bool; + + /// Trim the appendable log to fit within cost_mode budget. + /// Oldest turns are compacted/summarised first. + pub fn trim_to_budget(&mut self, mode: CostMode); +} +``` + +### 2. Cache-hit metering + +Add `CacheMetrics` to track per-session cache behavior: + +```rust +pub struct CacheMetrics { + pub warm_call_tokens: u64, + pub probe_call_tokens: u64, + pub cache_hit_tokens: u64, + pub cache_miss_tokens: u64, + pub cache_hit_rate: f64, // hit / (hit + miss) + pub last_probe_at: Option, +} +``` + +Integrate into: +- `Session.build_prompt_messages()` — record prefix byte count before send +- Daemon status response — include `cache_metrics` per session +- Scheduler — skip cache warming if last probe was recent + +### 3. 
Cost-aware prompt trimming + +Instead of only compacting by raw byte count, trim by cost mode budget: + +| Mode | Max prompt bytes | Max turns | Tool result policy | +|------|-----------------|-----------|-------------------| +| Fast | 500K (~12K tokens) | 5 | Truncate >4K | +| Smart | 2M (~50K tokens) | 20 | Truncate >16K | +| Max | 8M (~200K tokens) | 100 | Preserve all | + +```rust +pub fn trim_prompt_for_mode( + assembly: &mut PromptAssembly, + mode: CostMode, +) -> TrimResult { + // 1. Calculate current size + // 2. If under budget: return Ok + // 3. If over: compact oldest turns first + // 4. If still over: escalate cost mode (visible log) + // 5. If already Max and still over: truncate volatile, warn +} +``` + +### 4. Auto-escalation with visible logging + +Current `escalate()` is manual. T1.4 triggers it automatically when: + +```rust +pub enum EscalationTrigger { + /// Session byte count exceeded current mode budget + BudgetExceeded { current_bytes: u64, budget_bytes: u64 }, + /// Cache miss rate above threshold (prefix changed?) + CacheMissRate { rate: f64, threshold: f64 }, + /// Compaction didn't free enough space + CompactionInsufficient { freed_bytes: u64, needed_bytes: u64 }, +} +``` + +When triggered: +1. Log visible escalation event +2. Update session's active cost mode +3. Re-trim prompt to new budget +4. Record in session JSONL as `{"type":"escalation","from":"smart","to":"max","reason":"..."}` + +### 5. Scheduler handoff prompts + +When the scheduler spawns an agent via `cmd_spawn_agent`, inject cost-aware +prompt assembly: + +```rust +// Before spawn: +let session = state.sessions.get(&session_id)?; +let mut assembly = session.build_prompt_assembly().await; +assembly.trim_to_budget(cost_mode); // trims in place (`&mut self`), returns nothing + +// Inject into spawn command: +spawn_agent( + provider, + model, + session_id, + system_prompt, // from Region 1 + conversation_context, // from Region 2 (trimmed) + cost_mode, // passed to agent +); +``` + +### 6.
Daemon startup cache warming + +On daemon start, if `cache_warm_on_start` is true and a DeepSeek key is set: + +```rust +pub async fn warm_cache_on_startup(config: &DaemonConfig) { + if config.cache_warm_on_start && config.deepseek_api_key.is_some() { + let probe_cfg = ProbeConfig::from_env(); + let result = run_cache_probe(&probe_cfg).await; + info!( + cache_hit = result.cache_hit_observed, + hit_tokens = result.cache_hit_tokens, + "startup cache probe complete" + ); + } +} +``` + +## Implementation order + +### PR 1: PromptAssembly + CacheMetrics (no behavior change) + +Files: `session.rs`, `colibri-deepseek/src/lib.rs` +- Add `PromptAssembly` struct +- Add `CacheMetrics` struct +- Add `PromptAssembly::to_messages()` (wraps existing `build_prompt_messages`) +- Add `PromptAssembly::trim_to_budget()` (stub — no actual trimming yet) +- Tests: verify byte-stable prefix is identical across two assemblies +- Tests: verify trim_to_budget preserves ordering +- No change to daemon behavior — `build_prompt_messages()` still works + +### PR 2: Cost-aware trimming + auto-escalation + +Files: `cost.rs`, `session.rs`, `socket.rs` +- Implement `trim_to_budget()` with real compaction +- Add `EscalationTrigger` enum +- Auto-escalate on budget exceeded +- Log escalation events +- Add `set-cost-mode auto` to socket API +- Tests: trim leaves session under budget +- Tests: escalation chain Fast→Smart→Max + +### PR 3: Scheduler injection + cache warming + +Files: `socket.rs`, `spawner.rs`, `daemon.rs`, `config.rs` +- Inject cost-aware assembly into spawn-agent +- Cache warming on daemon startup +- Cache metrics in status response +- `cache_warm_on_start` config flag +- Tests: scheduler handoff includes trimmed prompt +- Tests: startup probe runs when key is set and `cache_warm_on_start` is true + +## Verification + +After all 3 PRs: + +```sh +# 1. Start daemon in fast mode +COLIBRI_COST_MODE=fast cargo run --bin colibri-daemon + +# 2. Verify cache warming +# Log should show: "startup cache probe complete" with hit/miss + +# 3.
Spawn agent and verify prompt assembly +echo '{"cmd":"spawn-agent","provider":"deepseek","model":"deepseek-chat","system_prompt":"..."}' | nc -U /tmp/colibri-daemon.sock + +# 4. Verify cost mode escalation +echo '{"cmd":"set-cost-mode","mode":"smart"}' | nc -U /tmp/colibri-daemon.sock + +# 5. Check status includes cache metrics +echo '{"cmd":"status"}' | nc -U /tmp/colibri-daemon.sock +``` + +## What NOT to change + +- `PiJsonlIngestor` in glasspane — unrelated +- `colibri-skills` crate — unrelated (parked on feature branch) +- `zot-runtime-event-adapter` — unrelated (parked on feature branch) +- `colibri-contracts` — ProviderSmokeResult/RunManifest types are stable +- `colibri-store` — SQLite schema unchanged From 7c1a9d886a5f500219df9303bc6c1782e29f84fe Mon Sep 17 00:00:00 2001 From: Sam & Hermes Date: Sun, 31 May 2026 15:21:10 +0200 Subject: [PATCH 2/2] feat: add PromptAssembly + CacheMetrics structs (T1.4 PR 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Structural only — no behavior change. Introduces: - PromptAssembly: named 3-region wrapper around build_prompt_messages() with to_messages(), immutable_prefix, appendable_log, volatile_scratch, total_bytes, estimated_tokens. - CacheMetrics: per-session cache-hit tracking with hit_rate() and record(). - Session::build_prompt_assembly() wraps existing build_prompt_messages() with no logic change. - 5 golden tests: assembly structure, empty volatile, hit rate calculations, record accumulation. - Linked T1.4-PROMPT-DISCIPLINE-PLAN.md from COLIBRI-CUTOVER-PLAN.md. No trimming, no escalation, no scheduler changes — PR 2 and 3 follow. Parked branches (colibri-skills, zot harness) untouched. 
Build: pass | Tests: 41/41 green (+5 new) | Clippy: clean | Fmt: clean --- crates/colibri-daemon/src/session.rs | 140 +++++++++++++++++++++++++++ docs/COLIBRI-CUTOVER-PLAN.md | 1 + 2 files changed, 141 insertions(+) diff --git a/crates/colibri-daemon/src/session.rs b/crates/colibri-daemon/src/session.rs index be0e61e..b618708 100644 --- a/crates/colibri-daemon/src/session.rs +++ b/crates/colibri-daemon/src/session.rs @@ -82,6 +82,59 @@ impl Turn { } } +// --------------------------------------------------------------------------- +// T1.4 — PromptAssembly + CacheMetrics (PR 1: structural, no behavior change) +// --------------------------------------------------------------------------- + +/// 3-region prompt assembly for a session: system prefix, conversation log, volatile scratch. +#[derive(Debug, Clone)] +pub struct PromptAssembly { + pub immutable_prefix: String, + pub appendable_log: Vec, + pub volatile_scratch: Vec, + pub total_bytes: u64, + pub estimated_tokens: u64, +} + +impl PromptAssembly { + pub fn to_messages(&self) -> Vec { + let mut messages = + Vec::with_capacity(1 + self.appendable_log.len() + self.volatile_scratch.len()); + messages.push(serde_json::json!({ + "role": "system", + "content": &self.immutable_prefix, + })); + messages.extend(self.appendable_log.iter().cloned()); + messages.extend(self.volatile_scratch.iter().cloned()); + messages + } +} + +#[derive(Debug, Clone, Default)] +pub struct CacheMetrics { + pub warm_call_tokens: u64, + pub probe_call_tokens: u64, + pub cache_hit_tokens: u64, + pub cache_miss_tokens: u64, + pub last_probe_at: Option, +} + +impl CacheMetrics { + pub fn hit_rate(&self) -> f64 { + let total = self.cache_hit_tokens as f64 + self.cache_miss_tokens as f64; + if total == 0.0 { + 0.0 + } else { + self.cache_hit_tokens as f64 / total + } + } + + pub fn record(&mut self, hit: u64, miss: u64) { + self.cache_hit_tokens = self.cache_hit_tokens.saturating_add(hit); + self.cache_miss_tokens = self.cache_miss_tokens.saturating_add(miss); + } +} + // --------------------------------------------------------------------------- // Session //
--------------------------------------------------------------------------- @@ -362,6 +415,34 @@ impl Session { // 3-region prompt assembly (for DeepSeek cache discipline) // ------------------------------------------------------------------ + /// Build a `PromptAssembly` from the current session state by wrapping + /// `build_prompt_messages()`; an empty message list yields an empty log instead of panicking. + pub async fn build_prompt_assembly(&self) -> PromptAssembly { + let messages = self.build_prompt_messages().await; + let total_bytes: u64 = messages + .iter() + .map(|m| serde_json::to_string(m).unwrap_or_default().len() as u64) + .sum(); + let estimated_tokens = total_bytes.div_ceil(3); + + let immutable_prefix = messages + .first() + .and_then(|m| m.get("content")) + .and_then(|c| c.as_str()) + .unwrap_or("") + .to_string(); + + let appendable_log = messages.get(1..).unwrap_or_default().to_vec(); + + PromptAssembly { + immutable_prefix, + appendable_log, + volatile_scratch: Vec::new(), + total_bytes, + estimated_tokens, + } + } + /// Build the 3-region prompt: /// 1. Immutable system prefix (byte-stable for cache hits) /// 2.
Appendable conversation log (turns, possibly with compaction gaps) @@ -427,4 +508,63 @@ impl Session { } } +// --------------------------------------------------------------------------- +// T1.4 golden tests — PromptAssembly region ordering + CacheMetrics hit-rate/accumulation +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod t14_tests { + use super::*; + + #[test] + fn prompt_assembly_to_messages_includes_all_regions() { + let assembly = PromptAssembly { + immutable_prefix: "SYSTEM".into(), + appendable_log: vec![ + serde_json::json!({"role": "user", "content": "hello"}), + serde_json::json!({"role": "assistant", "content": "hi"}), + ], + volatile_scratch: vec![serde_json::json!({"role": "user", "content": "v"})], + total_bytes: 0, + estimated_tokens: 0, + }; + let messages = assembly.to_messages(); + assert_eq!(messages.len(), 4); + assert_eq!(messages[0]["role"], "system"); + assert_eq!(messages[3]["content"], "v"); + } + + #[test] + fn prompt_assembly_empty_volatile_is_fine() { + let assembly = PromptAssembly { + immutable_prefix: "PREFIX".into(), + appendable_log: vec![], + volatile_scratch: vec![], + total_bytes: 0, + estimated_tokens: 0, + }; + assert_eq!(assembly.to_messages().len(), 1); + } + + #[test] + fn cache_metrics_hit_rate_zero_when_empty() { + assert_eq!(CacheMetrics::default().hit_rate(), 0.0); + } + + #[test] + fn cache_metrics_hit_rate_calculation() { + let mut m = CacheMetrics::default(); + m.record(300, 700); + assert!((m.hit_rate() - 0.3).abs() < 0.001); + } + + #[test] + fn cache_metrics_record_accumulates() { + let mut m = CacheMetrics::default(); + m.record(100, 400); + m.record(200, 300); + assert_eq!((m.cache_hit_tokens, m.cache_miss_tokens), (300, 700)); + } +} + // Tests deferred to crate-level integration tests (tests/ directory). diff --git a/docs/COLIBRI-CUTOVER-PLAN.md b/docs/COLIBRI-CUTOVER-PLAN.md index 57d7732..c7452ab 100644 --- a/docs/COLIBRI-CUTOVER-PLAN.md +++ b/docs/COLIBRI-CUTOVER-PLAN.md @@ -115,6 +115,7 @@ takeover.
Pi engine integration via the events parser; Telegram-intake compatibility. - **T1.4 Phase 5 — cache-first prompt discipline.** Deterministic 3-region prompt assembler; cost modes (fast/smart/max); visible escalation. + [Implementation plan →](T1.4-PROMPT-DISCIPLINE-PLAN.md) - **T1.5 Phase 6 — gated cutover.** Replace TS control-plane paths only after proof gates pass; **separate deprecation PR per path**. - **FreeBSD validation (each step):** push → Codex on osa runs `cargo test` on