feat(scheduler): wire eval-driven model selection into autospawn (Phase 3, 3a)

Steps 2/4/5 of Phase 3: config plumbing, dispatch wiring, rationale logging. - config: COLIBRI_MODEL_SELECTION (enable, default off), COLIBRI_MODEL_SELECTION_WINDOW_HOURS (168), and success/cost weight envs. - scheduler: available_providers() (providers with creds — resolves "model the agent can run") + recommend_model() — runs model_success_rates through select_model, returns None on disabled/cold-start so callers keep default. - socket: autospawn injects the eval-selected model into the agent env (COLIBRI_MODEL / DEEPSEEK_MODEL) and logs the rationale. Integration point note: the daemon runs long-lived agent harnesses, not per-task model dispatch, so selection is applied at agent spawn (env), not at the scheduler's per-task pick_agent sites. Gated off by default — zero behavior change until COLIBRI_MODEL_SELECTION is set. 6 new tests (available_providers, recommend_model gating x3, +). fmt + clippy clean. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 16:57:26 +02:00 · 2026-06-28 16:57:26 +02:00 · b59cc3da80
commit b59cc3da80
parent 521d9c4ed4
4 changed files with 186 additions and 0 deletions
--- a/crates/colibri-client/tests/live_socket_check.rs
+++ b/crates/colibri-client/tests/live_socket_check.rs
@ -36,6 +36,10 @@ fn check_config() -> DaemonConfig {
        terminal_watch_targets: Vec::new(),
        telegram_bot_token: None,
        telegram_chat_id: None,
+        model_selection_enabled: false,
+        model_selection_window_hours: 168,
+        model_selection_weight_success: 0.8,
+        model_selection_weight_cost: 0.2,
    }
 }

--- a/crates/colibri-daemon/src/config.rs
+++ b/crates/colibri-daemon/src/config.rs
@ -64,6 +64,15 @@ pub struct DaemonConfig {
    pub telegram_bot_token: Option<String>,
    /// Telegram chat id that terminal alerts are sent to (optional).
    pub telegram_chat_id: Option<String>,
+    /// Enable eval-driven model selection at dispatch (Phase 3). Disabled by
+    /// default — until on, dispatch uses `deepseek_model` as before.
+    pub model_selection_enabled: bool,
+    /// Eval window (hours) the selector aggregates success rates over.
+    pub model_selection_window_hours: u64,
+    /// Selection weight on success rate (primary signal).
+    pub model_selection_weight_success: f64,
+    /// Selection weight on cost bonus (tiebreaker).
+    pub model_selection_weight_cost: f64,
 }

 impl DaemonConfig {
@ -120,6 +129,14 @@ impl DaemonConfig {
            terminal_watch_targets: env_csv("COLIBRI_TERMINAL_WATCH"),
            telegram_bot_token: nonempty_env("TELEGRAM_BOT_TOKEN"),
            telegram_chat_id: nonempty_env("TELEGRAM_CHAT_ID"),
+            model_selection_enabled: env_bool("COLIBRI_MODEL_SELECTION"),
+            model_selection_window_hours: env_parse("COLIBRI_MODEL_SELECTION_WINDOW_HOURS")
+                .filter(|&n| n > 0)
+                .unwrap_or(168),
+            model_selection_weight_success: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_SUCCESS")
+                .unwrap_or(0.8),
+            model_selection_weight_cost: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_COST")
+                .unwrap_or(0.2),
        }
    }
 }
--- a/crates/colibri-daemon/src/scheduler.rs
+++ b/crates/colibri-daemon/src/scheduler.rs
@ -402,6 +402,57 @@ pub fn select_model(
    }
 }

+/// Providers the daemon can actually run, by credential presence. These are
+/// the only providers `select_model` may pick from (resolves Q3's "model the
+/// agent can run" with the data that exists today). `local` is always
+/// available — it needs no API key.
+pub fn available_providers(config: &crate::config::DaemonConfig) -> Vec<String> {
+    let mut out = vec!["local".to_string()];
+    if config.deepseek_api_key.is_some() {
+        out.push("deepseek".to_string());
+    }
+    if config.openrouter_api_key.is_some() {
+        out.push("openrouter".to_string());
+    }
+    if config.anthropic_api_key.is_some() {
+        out.push("anthropic".to_string());
+    }
+    out
+}
+
+/// Daemon-facing recommendation: run the eval rollup through `select_model`
+/// using the configured window and weights. Returns `None` when model
+/// selection is disabled or the pick is cold-start (so callers keep their
+/// existing default rather than churn the model on thin data).
+pub fn recommend_model(state: &SharedState) -> Option<ModelChoice> {
+    let config = &state.config;
+    if !config.model_selection_enabled {
+        return None;
+    }
+
+    let candidates = {
+        let store = state.store.lock().unwrap();
+        store
+            .model_success_rates(config.model_selection_window_hours)
+            .unwrap_or_default()
+    };
+
+    let weights = SelectionWeights {
+        success_rate: config.model_selection_weight_success,
+        cost: config.model_selection_weight_cost,
+    };
+    let available = available_providers(config);
+    let choice = select_model(&candidates, &available, weights, &config.deepseek_model);
+
+    // Cold-start choice carries no provider — signal "no data-driven pick".
+    if choice.provider.is_none() {
+        debug!(rationale = %choice.rationale, "model-selection: cold-start, keeping default");
+        None
+    } else {
+        Some(choice)
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Cron matching (simplified)
 // ---------------------------------------------------------------------------
@ -496,6 +547,10 @@ mod tests {
            terminal_watch_targets: Vec::new(),
            telegram_bot_token: None,
            telegram_chat_id: None,
+            model_selection_enabled: false,
+            model_selection_window_hours: 168,
+            model_selection_weight_success: 0.8,
+            model_selection_weight_cost: 0.2,
        }
    }

@ -879,4 +934,93 @@ mod tests {
        assert_eq!(choice.model, "deepseek-chat");
        assert!(choice.rationale.contains("cold-start"));
    }
+
+    #[test]
+    fn available_providers_tracks_credentials() {
+        let mut config = test_config();
+        config.deepseek_api_key = None;
+        config.openrouter_api_key = None;
+        config.anthropic_api_key = None;
+        assert_eq!(available_providers(&config), vec!["local".to_string()]);
+
+        config.deepseek_api_key = Some("k".to_string());
+        config.anthropic_api_key = Some("k".to_string());
+        let p = available_providers(&config);
+        assert!(p.contains(&"deepseek".to_string()));
+        assert!(p.contains(&"anthropic".to_string()));
+        assert!(!p.contains(&"openrouter".to_string()));
+    }
+
+    // Seed `n` evaluated deepseek-chat tasks, `n_success` of them successful.
+    fn seed_deepseek_history(state: &SharedState, n: u64, n_success: u64) {
+        let store = state.store.lock().unwrap();
+        for i in 0..n {
+            let task = store.create_task("t", None).unwrap();
+            let quality = if i < n_success { 0.9 } else { 0.3 };
+            store
+                .set_task_cost(
+                    &task.id,
+                    &colibri_ledger::TaskCost {
+                        provider: Some("deepseek".to_string()),
+                        model: Some("deepseek-chat".to_string()),
+                        input_tokens: 0,
+                        output_tokens: 0,
+                        cache_read_tokens: 0,
+                        cache_write_tokens: 0,
+                        cost: 0.003,
+                        success: quality >= 0.7,
+                    },
+                )
+                .unwrap();
+            store
+                .write_task_eval(&colibri_ledger::TaskEval {
+                    task_id: task.id.clone(),
+                    agent_id: None,
+                    eval_mode: "agent".to_string(),
+                    completion_status: if quality >= 0.7 { "success" } else { "fail" }.to_string(),
+                    quality_score: Some(quality),
+                    correctness_check: if quality >= 0.7 { "pass" } else { "fail" }.to_string(),
+                    eval_provider: Some("test".to_string()),
+                    eval_latency_ms: None,
+                    eval_cost_usd: 0.0,
+                    evaluated_at: Utc::now().to_rfc3339(),
+                })
+                .unwrap();
+        }
+    }
+
+    #[test]
+    fn recommend_model_disabled_returns_none() {
+        let mut config = test_config();
+        config.model_selection_enabled = false;
+        config.deepseek_api_key = Some("k".to_string());
+        let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
+        seed_deepseek_history(&state, 10, 9);
+        assert!(recommend_model(&state).is_none(), "disabled → no pick");
+    }
+
+    #[test]
+    fn recommend_model_cold_start_returns_none() {
+        let mut config = test_config();
+        config.model_selection_enabled = true;
+        config.deepseek_api_key = Some("k".to_string());
+        let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
+        seed_deepseek_history(&state, 2, 2); // below MIN_EVAL_SAMPLES
+        assert!(
+            recommend_model(&state).is_none(),
+            "thin data → keep default"
+        );
+    }
+
+    #[test]
+    fn recommend_model_enabled_with_history_picks() {
+        let mut config = test_config();
+        config.model_selection_enabled = true;
+        config.deepseek_api_key = Some("k".to_string());
+        let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
+        seed_deepseek_history(&state, 10, 9);
+        let choice = recommend_model(&state).expect("enabled + history → pick");
+        assert_eq!(choice.model, "deepseek-chat");
+        assert_eq!(choice.provider.as_deref(), Some("deepseek"));
+    }
 }
--- a/crates/colibri-daemon/src/socket.rs
+++ b/crates/colibri-daemon/src/socket.rs
@ -614,6 +614,23 @@ pub async fn autospawn_agent_if_configured(state: &SharedState) {
        debug!("autospawn: clawdie-system-probe not found at {probe_binary}; skipping hw profile");
    }

+    // Eval-driven model selection (Phase 3): when enabled and there's a
+    // confident pick from eval history, inject it as the agent's model so its
+    // cloud calls use the best-performing model. Gated off by default — when
+    // off, recommend_model returns None and the agent keeps its own default.
+    if let Some(choice) = crate::scheduler::recommend_model(state) {
+        info!(
+            provider = ?choice.provider,
+            model = %choice.model,
+            rationale = %choice.rationale,
+            "model-selection: injecting eval-selected model into autospawn",
+        );
+        extra_env.insert("COLIBRI_MODEL".to_string(), choice.model.clone());
+        if choice.provider.as_deref() == Some("deepseek") {
+            extra_env.insert("DEEPSEEK_MODEL".to_string(), choice.model);
+        }
+    }
+
    // Capture the hw profile before extra_env moves into cmd_spawn_agent.
    let hw_profile_for_mother = extra_env.get("CLAWDIE_HW_PROFILE").cloned();

@ -1531,6 +1548,10 @@ mod tests {
            terminal_watch_targets: Vec::new(),
            telegram_bot_token: None,
            telegram_chat_id: None,
+            model_selection_enabled: false,
+            model_selection_window_hours: 168,
+            model_selection_weight_success: 0.8,
+            model_selection_weight_cost: 0.2,
        }
    }