Eval-driven model selection (Phase 3).

Adds four `model_selection_*` settings to `DaemonConfig` (read from the
`COLIBRI_MODEL_SELECTION*` environment variables), the `available_providers`
and `recommend_model` helpers in the scheduler together with their tests, and a
step in `autospawn_agent_if_configured` that inserts the recommended model into
`extra_env` as `COLIBRI_MODEL` (and `DEEPSEEK_MODEL` when the provider is
`deepseek`). The three `DaemonConfig` test fixtures are updated to match.

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed and whose generic parameters had been stripped.
  * Indentation and blank lines are reconstructed from rustfmt conventions and
    the hunk line counts; confirm context whitespace against the tree before
    applying.
  * `Vec<String>` (return of `available_providers`) is restored from the
    function body and its test.
  * `Option<String>` on the two telegram fields is presumed from their
    `nonempty_env(..)` initialisers; verify against config.rs.
  * The type parameter of `recommend_model`'s `Option` return could not be
    recovered and is left as found; fill in the type returned by
    `select_model` before applying.

diff --git a/crates/colibri-client/tests/live_socket_check.rs b/crates/colibri-client/tests/live_socket_check.rs
index 246de0c..8a0529e 100644
--- a/crates/colibri-client/tests/live_socket_check.rs
+++ b/crates/colibri-client/tests/live_socket_check.rs
@@ -36,6 +36,10 @@ fn check_config() -> DaemonConfig {
         terminal_watch_targets: Vec::new(),
         telegram_bot_token: None,
         telegram_chat_id: None,
+        model_selection_enabled: false,
+        model_selection_window_hours: 168,
+        model_selection_weight_success: 0.8,
+        model_selection_weight_cost: 0.2,
     }
 }
 
diff --git a/crates/colibri-daemon/src/config.rs b/crates/colibri-daemon/src/config.rs
index aa41163..e1a7bc0 100644
--- a/crates/colibri-daemon/src/config.rs
+++ b/crates/colibri-daemon/src/config.rs
@@ -64,6 +64,15 @@ pub struct DaemonConfig {
     pub telegram_bot_token: Option<String>,
     /// Telegram chat id that terminal alerts are sent to (optional).
     pub telegram_chat_id: Option<String>,
+    /// Enable eval-driven model selection at dispatch (Phase 3). Disabled by
+    /// default — until on, dispatch uses `deepseek_model` as before.
+    pub model_selection_enabled: bool,
+    /// Eval window (hours) the selector aggregates success rates over.
+    pub model_selection_window_hours: u64,
+    /// Selection weight on success rate (primary signal).
+    pub model_selection_weight_success: f64,
+    /// Selection weight on cost bonus (tiebreaker).
+    pub model_selection_weight_cost: f64,
 }
 
 impl DaemonConfig {
@@ -120,6 +129,14 @@ impl DaemonConfig {
             terminal_watch_targets: env_csv("COLIBRI_TERMINAL_WATCH"),
             telegram_bot_token: nonempty_env("TELEGRAM_BOT_TOKEN"),
             telegram_chat_id: nonempty_env("TELEGRAM_CHAT_ID"),
+            model_selection_enabled: env_bool("COLIBRI_MODEL_SELECTION"),
+            model_selection_window_hours: env_parse("COLIBRI_MODEL_SELECTION_WINDOW_HOURS")
+                .filter(|&n| n > 0)
+                .unwrap_or(168),
+            model_selection_weight_success: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_SUCCESS")
+                .unwrap_or(0.8),
+            model_selection_weight_cost: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_COST")
+                .unwrap_or(0.2),
         }
     }
 }
diff --git a/crates/colibri-daemon/src/scheduler.rs b/crates/colibri-daemon/src/scheduler.rs
index fc80dcc..0b932a6 100644
--- a/crates/colibri-daemon/src/scheduler.rs
+++ b/crates/colibri-daemon/src/scheduler.rs
@@ -402,6 +402,57 @@ pub fn select_model(
     }
 }
 
+/// Providers the daemon can actually run, by credential presence. These are
+/// the only providers `select_model` may pick from (resolves Q3's "model the
+/// agent can run" with the data that exists today). `local` is always
+/// available — it needs no API key.
+pub fn available_providers(config: &crate::config::DaemonConfig) -> Vec<String> {
+    let mut out = vec!["local".to_string()];
+    if config.deepseek_api_key.is_some() {
+        out.push("deepseek".to_string());
+    }
+    if config.openrouter_api_key.is_some() {
+        out.push("openrouter".to_string());
+    }
+    if config.anthropic_api_key.is_some() {
+        out.push("anthropic".to_string());
+    }
+    out
+}
+
+/// Daemon-facing recommendation: run the eval rollup through `select_model`
+/// using the configured window and weights. Returns `None` when model
+/// selection is disabled or the pick is cold-start (so callers keep their
+/// existing default rather than churn the model on thin data).
+pub fn recommend_model(state: &SharedState) -> Option {
+    let config = &state.config;
+    if !config.model_selection_enabled {
+        return None;
+    }
+
+    let candidates = {
+        let store = state.store.lock().unwrap();
+        store
+            .model_success_rates(config.model_selection_window_hours)
+            .unwrap_or_default()
+    };
+
+    let weights = SelectionWeights {
+        success_rate: config.model_selection_weight_success,
+        cost: config.model_selection_weight_cost,
+    };
+    let available = available_providers(config);
+    let choice = select_model(&candidates, &available, weights, &config.deepseek_model);
+
+    // Cold-start choice carries no provider — signal "no data-driven pick".
+    if choice.provider.is_none() {
+        debug!(rationale = %choice.rationale, "model-selection: cold-start, keeping default");
+        None
+    } else {
+        Some(choice)
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Cron matching (simplified)
 // ---------------------------------------------------------------------------
@@ -496,6 +547,10 @@ mod tests {
             terminal_watch_targets: Vec::new(),
             telegram_bot_token: None,
             telegram_chat_id: None,
+            model_selection_enabled: false,
+            model_selection_window_hours: 168,
+            model_selection_weight_success: 0.8,
+            model_selection_weight_cost: 0.2,
         }
     }
 
@@ -879,4 +934,93 @@ mod tests {
         assert_eq!(choice.model, "deepseek-chat");
         assert!(choice.rationale.contains("cold-start"));
     }
+
+    #[test]
+    fn available_providers_tracks_credentials() {
+        let mut config = test_config();
+        config.deepseek_api_key = None;
+        config.openrouter_api_key = None;
+        config.anthropic_api_key = None;
+        assert_eq!(available_providers(&config), vec!["local".to_string()]);
+
+        config.deepseek_api_key = Some("k".to_string());
+        config.anthropic_api_key = Some("k".to_string());
+        let p = available_providers(&config);
+        assert!(p.contains(&"deepseek".to_string()));
+        assert!(p.contains(&"anthropic".to_string()));
+        assert!(!p.contains(&"openrouter".to_string()));
+    }
+
+    // Seed `n` evaluated deepseek-chat tasks, `n_success` of them successful.
+    fn seed_deepseek_history(state: &SharedState, n: u64, n_success: u64) {
+        let store = state.store.lock().unwrap();
+        for i in 0..n {
+            let task = store.create_task("t", None).unwrap();
+            let quality = if i < n_success { 0.9 } else { 0.3 };
+            store
+                .set_task_cost(
+                    &task.id,
+                    &colibri_ledger::TaskCost {
+                        provider: Some("deepseek".to_string()),
+                        model: Some("deepseek-chat".to_string()),
+                        input_tokens: 0,
+                        output_tokens: 0,
+                        cache_read_tokens: 0,
+                        cache_write_tokens: 0,
+                        cost: 0.003,
+                        success: quality >= 0.7,
+                    },
+                )
+                .unwrap();
+            store
+                .write_task_eval(&colibri_ledger::TaskEval {
+                    task_id: task.id.clone(),
+                    agent_id: None,
+                    eval_mode: "agent".to_string(),
+                    completion_status: if quality >= 0.7 { "success" } else { "fail" }.to_string(),
+                    quality_score: Some(quality),
+                    correctness_check: if quality >= 0.7 { "pass" } else { "fail" }.to_string(),
+                    eval_provider: Some("test".to_string()),
+                    eval_latency_ms: None,
+                    eval_cost_usd: 0.0,
+                    evaluated_at: Utc::now().to_rfc3339(),
+                })
+                .unwrap();
+        }
+    }
+
+    #[test]
+    fn recommend_model_disabled_returns_none() {
+        let mut config = test_config();
+        config.model_selection_enabled = false;
+        config.deepseek_api_key = Some("k".to_string());
+        let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
+        seed_deepseek_history(&state, 10, 9);
+        assert!(recommend_model(&state).is_none(), "disabled → no pick");
+    }
+
+    #[test]
+    fn recommend_model_cold_start_returns_none() {
+        let mut config = test_config();
+        config.model_selection_enabled = true;
+        config.deepseek_api_key = Some("k".to_string());
+        let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
+        seed_deepseek_history(&state, 2, 2); // below MIN_EVAL_SAMPLES
+        assert!(
+            recommend_model(&state).is_none(),
+            "thin data → keep default"
+        );
+    }
+
+    #[test]
+    fn recommend_model_enabled_with_history_picks() {
+        let mut config = test_config();
+        config.model_selection_enabled = true;
+        config.deepseek_api_key = Some("k".to_string());
+        let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
+        seed_deepseek_history(&state, 10, 9);
+        let choice = recommend_model(&state).expect("enabled + history → pick");
+        assert_eq!(choice.model, "deepseek-chat");
+        assert_eq!(choice.provider.as_deref(), Some("deepseek"));
+    }
 }
diff --git a/crates/colibri-daemon/src/socket.rs b/crates/colibri-daemon/src/socket.rs
index 85578c4..74a3c9d 100644
--- a/crates/colibri-daemon/src/socket.rs
+++ b/crates/colibri-daemon/src/socket.rs
@@ -614,6 +614,23 @@ pub async fn autospawn_agent_if_configured(state: &SharedState) {
         debug!("autospawn: clawdie-system-probe not found at {probe_binary}; skipping hw profile");
     }
 
+    // Eval-driven model selection (Phase 3): when enabled and there's a
+    // confident pick from eval history, inject it as the agent's model so its
+    // cloud calls use the best-performing model. Gated off by default — when
+    // off, recommend_model returns None and the agent keeps its own default.
+    if let Some(choice) = crate::scheduler::recommend_model(state) {
+        info!(
+            provider = ?choice.provider,
+            model = %choice.model,
+            rationale = %choice.rationale,
+            "model-selection: injecting eval-selected model into autospawn",
+        );
+        extra_env.insert("COLIBRI_MODEL".to_string(), choice.model.clone());
+        if choice.provider.as_deref() == Some("deepseek") {
+            extra_env.insert("DEEPSEEK_MODEL".to_string(), choice.model);
+        }
+    }
+
     // Capture the hw profile before extra_env moves into cmd_spawn_agent.
     let hw_profile_for_mother = extra_env.get("CLAWDIE_HW_PROFILE").cloned();
 
@@ -1531,6 +1548,10 @@ mod tests {
             terminal_watch_targets: Vec::new(),
             telegram_bot_token: None,
             telegram_chat_id: None,
+            model_selection_enabled: false,
+            model_selection_window_hours: 168,
+            model_selection_weight_success: 0.8,
+            model_selection_weight_cost: 0.2,
         }
     }
 