feat(scheduler): wire eval-driven model selection into autospawn (Phase 3, 3a)
Some checks failed
CI / agent-jail-pkgs (pull_request) Has been cancelled
CI / rust (pull_request) Has been cancelled
CI / markdown (pull_request) Has been cancelled
CI / port (pull_request) Has been cancelled

Steps 2/4/5 of Phase 3: config plumbing, dispatch wiring, rationale logging.

- config: COLIBRI_MODEL_SELECTION (enable, default off),
  COLIBRI_MODEL_SELECTION_WINDOW_HOURS (168), and success/cost weight envs.
- scheduler: available_providers() lists providers with credentials (resolves
  "model the agent can run"); recommend_model() runs model_success_rates
  through select_model and returns None when disabled or cold-start, so
  callers keep their default.
- socket: autospawn injects the eval-selected model into the agent env
  (COLIBRI_MODEL / DEEPSEEK_MODEL) and logs the rationale.

Integration point note: the daemon runs long-lived agent harnesses, not
per-task model dispatch, so selection is applied at agent spawn (env), not
at the scheduler's per-task pick_agent sites. Gated off by default — zero
behavior change until COLIBRI_MODEL_SELECTION is set.

6 new tests (available_providers, recommend_model gating x3, +). fmt + clippy clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Sam & Claude 2026-06-28 16:57:26 +02:00
parent 521d9c4ed4
commit b59cc3da80
4 changed files with 186 additions and 0 deletions

View file

@ -36,6 +36,10 @@ fn check_config() -> DaemonConfig {
terminal_watch_targets: Vec::new(),
telegram_bot_token: None,
telegram_chat_id: None,
model_selection_enabled: false,
model_selection_window_hours: 168,
model_selection_weight_success: 0.8,
model_selection_weight_cost: 0.2,
}
}

View file

@ -64,6 +64,15 @@ pub struct DaemonConfig {
pub telegram_bot_token: Option<String>,
/// Telegram chat id that terminal alerts are sent to (optional).
pub telegram_chat_id: Option<String>,
/// Enable eval-driven model selection, applied when an agent is autospawned
/// (Phase 3). Disabled by default — while disabled, no model is injected and
/// agents keep their existing default (`deepseek_model` handling is unchanged).
pub model_selection_enabled: bool,
/// Eval window (hours) the selector aggregates success rates over.
pub model_selection_window_hours: u64,
/// Selection weight on success rate (primary signal).
pub model_selection_weight_success: f64,
/// Selection weight on cost bonus (tiebreaker).
pub model_selection_weight_cost: f64,
}
impl DaemonConfig {
@ -120,6 +129,14 @@ impl DaemonConfig {
terminal_watch_targets: env_csv("COLIBRI_TERMINAL_WATCH"),
telegram_bot_token: nonempty_env("TELEGRAM_BOT_TOKEN"),
telegram_chat_id: nonempty_env("TELEGRAM_CHAT_ID"),
model_selection_enabled: env_bool("COLIBRI_MODEL_SELECTION"),
model_selection_window_hours: env_parse("COLIBRI_MODEL_SELECTION_WINDOW_HOURS")
.filter(|&n| n > 0)
.unwrap_or(168),
model_selection_weight_success: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_SUCCESS")
.unwrap_or(0.8),
model_selection_weight_cost: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_COST")
.unwrap_or(0.2),
}
}
}

View file

@ -402,6 +402,57 @@ pub fn select_model(
}
}
/// Lists the providers this daemon is able to run, judged by which
/// credentials are configured. `select_model` may only choose among these
/// (resolves Q3's "model the agent can run" with the data that exists today).
/// `local` always comes first and is always present — it needs no API key.
pub fn available_providers(config: &crate::config::DaemonConfig) -> Vec<String> {
    // Credential-gated providers, in the order they are reported.
    let gated = [
        ("deepseek", config.deepseek_api_key.is_some()),
        ("openrouter", config.openrouter_api_key.is_some()),
        ("anthropic", config.anthropic_api_key.is_some()),
    ];
    std::iter::once("local")
        .chain(
            gated
                .iter()
                .filter(|(_, has_key)| *has_key)
                .map(|(name, _)| *name),
        )
        .map(str::to_string)
        .collect()
}
/// Daemon-facing recommendation: feeds the eval rollup through `select_model`
/// using the configured window and weights.
///
/// Returns `None` when model selection is disabled, or when the pick is
/// cold-start (the choice carries no provider), so callers keep their
/// existing default rather than churn the model on thin data.
///
/// A failed `model_success_rates` query is logged and then treated as the
/// default (empty) candidate set instead of being swallowed silently.
///
/// # Panics
///
/// Panics if acquiring the store lock fails (`lock().unwrap()`).
pub fn recommend_model(state: &SharedState) -> Option<ModelChoice> {
    let config = &state.config;
    if !config.model_selection_enabled {
        return None;
    }
    // Keep the store guard scoped to the query so it is released before
    // selection and logging run.
    let candidates = {
        let store = state.store.lock().unwrap();
        store
            .model_success_rates(config.model_selection_window_hours)
            .unwrap_or_else(|err| {
                // Surface the failure: without this, a broken store query is
                // indistinguishable from a genuine cold start in the logs.
                debug!(
                    error = ?err,
                    "model-selection: failed to load eval history, treating as empty"
                );
                Default::default()
            })
    };
    let weights = SelectionWeights {
        success_rate: config.model_selection_weight_success,
        cost: config.model_selection_weight_cost,
    };
    let available = available_providers(config);
    let choice = select_model(&candidates, &available, weights, &config.deepseek_model);
    // Cold-start choice carries no provider — signal "no data-driven pick".
    if choice.provider.is_none() {
        debug!(rationale = %choice.rationale, "model-selection: cold-start, keeping default");
        return None;
    }
    Some(choice)
}
// ---------------------------------------------------------------------------
// Cron matching (simplified)
// ---------------------------------------------------------------------------
@ -496,6 +547,10 @@ mod tests {
terminal_watch_targets: Vec::new(),
telegram_bot_token: None,
telegram_chat_id: None,
model_selection_enabled: false,
model_selection_window_hours: 168,
model_selection_weight_success: 0.8,
model_selection_weight_cost: 0.2,
}
}
@ -879,4 +934,93 @@ mod tests {
assert_eq!(choice.model, "deepseek-chat");
assert!(choice.rationale.contains("cold-start"));
}
#[test]
fn available_providers_tracks_credentials() {
    // With every API key cleared, only the key-less `local` provider remains.
    let mut cfg = test_config();
    cfg.deepseek_api_key = None;
    cfg.openrouter_api_key = None;
    cfg.anthropic_api_key = None;
    assert_eq!(available_providers(&cfg), vec!["local".to_string()]);

    // Set two of the three keys; the provider without a key must stay absent.
    cfg.deepseek_api_key = Some("k".to_string());
    cfg.anthropic_api_key = Some("k".to_string());
    let providers = available_providers(&cfg);
    let listed = |name: &str| providers.iter().any(|p| p == name);
    assert!(listed("deepseek"));
    assert!(listed("anthropic"));
    assert!(!listed("openrouter"));
}
// Seeds `n` evaluated deepseek-chat tasks. The first `n_success` get a
// quality score of 0.9 (recorded as success/pass); the rest get 0.3
// (recorded as fail/fail).
fn seed_deepseek_history(state: &SharedState, n: u64, n_success: u64) {
    let store = state.store.lock().unwrap();
    for idx in 0..n {
        let quality = if idx < n_success { 0.9 } else { 0.3 };
        // One pass/fail decision drives the cost row and both eval labels.
        let passed = quality >= 0.7;
        let (status, check) = if passed {
            ("success", "pass")
        } else {
            ("fail", "fail")
        };

        let task = store.create_task("t", None).unwrap();

        let cost = colibri_ledger::TaskCost {
            provider: Some("deepseek".to_string()),
            model: Some("deepseek-chat".to_string()),
            input_tokens: 0,
            output_tokens: 0,
            cache_read_tokens: 0,
            cache_write_tokens: 0,
            cost: 0.003,
            success: passed,
        };
        store.set_task_cost(&task.id, &cost).unwrap();

        let eval = colibri_ledger::TaskEval {
            task_id: task.id.clone(),
            agent_id: None,
            eval_mode: "agent".to_string(),
            completion_status: status.to_string(),
            quality_score: Some(quality),
            correctness_check: check.to_string(),
            eval_provider: Some("test".to_string()),
            eval_latency_ms: None,
            eval_cost_usd: 0.0,
            evaluated_at: Utc::now().to_rfc3339(),
        };
        store.write_task_eval(&eval).unwrap();
    }
}
#[test]
fn recommend_model_disabled_returns_none() {
    // A credential and ten seeded evals exist, but the feature flag is off.
    let mut cfg = test_config();
    cfg.deepseek_api_key = Some("k".to_string());
    cfg.model_selection_enabled = false;
    let shared: SharedState = Arc::new(crate::daemon::DaemonState::new(cfg));
    seed_deepseek_history(&shared, 10, 9);

    let pick = recommend_model(&shared);
    assert!(pick.is_none(), "disabled → no pick");
}
#[test]
fn recommend_model_cold_start_returns_none() {
    // Selection is on, but only two evals are seeded.
    let mut cfg = test_config();
    cfg.deepseek_api_key = Some("k".to_string());
    cfg.model_selection_enabled = true;
    let shared: SharedState = Arc::new(crate::daemon::DaemonState::new(cfg));
    seed_deepseek_history(&shared, 2, 2); // below MIN_EVAL_SAMPLES

    let pick = recommend_model(&shared);
    assert!(pick.is_none(), "thin data → keep default");
}
#[test]
fn recommend_model_enabled_with_history_picks() {
    // Selection is on, a deepseek credential is set, and ten evals are
    // seeded with nine marked successful.
    let mut cfg = test_config();
    cfg.deepseek_api_key = Some("k".to_string());
    cfg.model_selection_enabled = true;
    let shared: SharedState = Arc::new(crate::daemon::DaemonState::new(cfg));
    seed_deepseek_history(&shared, 10, 9);

    let picked = recommend_model(&shared).expect("enabled + history → pick");
    assert_eq!(picked.provider.as_deref(), Some("deepseek"));
    assert_eq!(picked.model, "deepseek-chat");
}
}

View file

@ -614,6 +614,23 @@ pub async fn autospawn_agent_if_configured(state: &SharedState) {
debug!("autospawn: clawdie-system-probe not found at {probe_binary}; skipping hw profile");
}
// Eval-driven model selection (Phase 3): when enabled and there's a
// confident pick from eval history, inject it as the agent's model so its
// cloud calls use the best-performing model. Gated off by default — when
// off, recommend_model returns None and the agent keeps its own default.
if let Some(choice) = crate::scheduler::recommend_model(state) {
info!(
provider = ?choice.provider,
model = %choice.model,
rationale = %choice.rationale,
"model-selection: injecting eval-selected model into autospawn",
);
extra_env.insert("COLIBRI_MODEL".to_string(), choice.model.clone());
if choice.provider.as_deref() == Some("deepseek") {
extra_env.insert("DEEPSEEK_MODEL".to_string(), choice.model);
}
}
// Capture the hw profile before extra_env moves into cmd_spawn_agent.
let hw_profile_for_mother = extra_env.get("CLAWDIE_HW_PROFILE").cloned();
@ -1531,6 +1548,10 @@ mod tests {
terminal_watch_targets: Vec::new(),
telegram_bot_token: None,
telegram_chat_id: None,
model_selection_enabled: false,
model_selection_window_hours: 168,
model_selection_weight_success: 0.8,
model_selection_weight_cost: 0.2,
}
}