feat(scheduler): wire eval-driven model selection into autospawn (Phase 3, 3a)
Steps 2/4/5 of Phase 3: config plumbing, dispatch wiring, rationale logging. - config: COLIBRI_MODEL_SELECTION (enable, default off), COLIBRI_MODEL_SELECTION_WINDOW_HOURS (168), and success/cost weight envs. - scheduler: available_providers() (providers with creds — resolves "model the agent can run") + recommend_model() — runs model_success_rates through select_model, returns None on disabled/cold-start so callers keep default. - socket: autospawn injects the eval-selected model into the agent env (COLIBRI_MODEL / DEEPSEEK_MODEL) and logs the rationale. Integration point note: the daemon runs long-lived agent harnesses, not per-task model dispatch, so selection is applied at agent spawn (env), not at the scheduler's per-task pick_agent sites. Gated off by default — zero behavior change until COLIBRI_MODEL_SELECTION is set. 6 new tests (available_providers, recommend_model gating x3, +). fmt + clippy clean. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
521d9c4ed4
commit
b59cc3da80
4 changed files with 186 additions and 0 deletions
|
|
@ -36,6 +36,10 @@ fn check_config() -> DaemonConfig {
|
|||
terminal_watch_targets: Vec::new(),
|
||||
telegram_bot_token: None,
|
||||
telegram_chat_id: None,
|
||||
model_selection_enabled: false,
|
||||
model_selection_window_hours: 168,
|
||||
model_selection_weight_success: 0.8,
|
||||
model_selection_weight_cost: 0.2,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -64,6 +64,15 @@ pub struct DaemonConfig {
|
|||
pub telegram_bot_token: Option<String>,
|
||||
/// Telegram chat id that terminal alerts are sent to (optional).
|
||||
pub telegram_chat_id: Option<String>,
|
||||
/// Enable eval-driven model selection at dispatch (Phase 3). Disabled by
|
||||
/// default — while it is off, dispatch uses `deepseek_model` as before.
|
||||
pub model_selection_enabled: bool,
|
||||
/// Eval window (hours) the selector aggregates success rates over.
|
||||
pub model_selection_window_hours: u64,
|
||||
/// Selection weight on success rate (primary signal).
|
||||
pub model_selection_weight_success: f64,
|
||||
/// Selection weight on cost bonus (tiebreaker).
|
||||
pub model_selection_weight_cost: f64,
|
||||
}
|
||||
|
||||
impl DaemonConfig {
|
||||
|
|
@ -120,6 +129,14 @@ impl DaemonConfig {
|
|||
terminal_watch_targets: env_csv("COLIBRI_TERMINAL_WATCH"),
|
||||
telegram_bot_token: nonempty_env("TELEGRAM_BOT_TOKEN"),
|
||||
telegram_chat_id: nonempty_env("TELEGRAM_CHAT_ID"),
|
||||
model_selection_enabled: env_bool("COLIBRI_MODEL_SELECTION"),
|
||||
model_selection_window_hours: env_parse("COLIBRI_MODEL_SELECTION_WINDOW_HOURS")
|
||||
.filter(|&n| n > 0)
|
||||
.unwrap_or(168),
|
||||
model_selection_weight_success: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_SUCCESS")
|
||||
.unwrap_or(0.8),
|
||||
model_selection_weight_cost: env_parse("COLIBRI_MODEL_SELECTION_WEIGHT_COST")
|
||||
.unwrap_or(0.2),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -402,6 +402,57 @@ pub fn select_model(
|
|||
}
|
||||
}
|
||||
|
||||
/// Providers the daemon can actually run, by credential presence. These are
|
||||
/// the only providers `select_model` may pick from (resolves Q3's "model the
|
||||
/// agent can run" with the data that exists today). `local` is always
|
||||
/// available — it needs no API key.
|
||||
pub fn available_providers(config: &crate::config::DaemonConfig) -> Vec<String> {
|
||||
let mut out = vec!["local".to_string()];
|
||||
if config.deepseek_api_key.is_some() {
|
||||
out.push("deepseek".to_string());
|
||||
}
|
||||
if config.openrouter_api_key.is_some() {
|
||||
out.push("openrouter".to_string());
|
||||
}
|
||||
if config.anthropic_api_key.is_some() {
|
||||
out.push("anthropic".to_string());
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Daemon-facing recommendation: run the eval rollup through `select_model`
|
||||
/// using the configured window and weights. Returns `None` when model
|
||||
/// selection is disabled or the pick is cold-start (so callers keep their
|
||||
/// existing default rather than churn the model on thin data).
|
||||
pub fn recommend_model(state: &SharedState) -> Option<ModelChoice> {
|
||||
let config = &state.config;
|
||||
if !config.model_selection_enabled {
|
||||
return None;
|
||||
}
|
||||
|
||||
let candidates = {
|
||||
let store = state.store.lock().unwrap();
|
||||
store
|
||||
.model_success_rates(config.model_selection_window_hours)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
|
||||
let weights = SelectionWeights {
|
||||
success_rate: config.model_selection_weight_success,
|
||||
cost: config.model_selection_weight_cost,
|
||||
};
|
||||
let available = available_providers(config);
|
||||
let choice = select_model(&candidates, &available, weights, &config.deepseek_model);
|
||||
|
||||
// Cold-start choice carries no provider — signal "no data-driven pick".
|
||||
if choice.provider.is_none() {
|
||||
debug!(rationale = %choice.rationale, "model-selection: cold-start, keeping default");
|
||||
None
|
||||
} else {
|
||||
Some(choice)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cron matching (simplified)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -496,6 +547,10 @@ mod tests {
|
|||
terminal_watch_targets: Vec::new(),
|
||||
telegram_bot_token: None,
|
||||
telegram_chat_id: None,
|
||||
model_selection_enabled: false,
|
||||
model_selection_window_hours: 168,
|
||||
model_selection_weight_success: 0.8,
|
||||
model_selection_weight_cost: 0.2,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -879,4 +934,93 @@ mod tests {
|
|||
assert_eq!(choice.model, "deepseek-chat");
|
||||
assert!(choice.rationale.contains("cold-start"));
|
||||
}
|
||||
|
||||
#[test]
fn available_providers_tracks_credentials() {
    // With every API key cleared, only the key-less `local` provider is left.
    let mut cfg = test_config();
    cfg.deepseek_api_key = None;
    cfg.openrouter_api_key = None;
    cfg.anthropic_api_key = None;
    assert_eq!(available_providers(&cfg), vec!["local".to_string()]);

    // Granting two keys must surface exactly those two providers.
    cfg.deepseek_api_key = Some("k".to_string());
    cfg.anthropic_api_key = Some("k".to_string());
    let providers = available_providers(&cfg);
    let listed = |name: &str| providers.iter().any(|p| p == name);
    assert!(listed("deepseek"));
    assert!(listed("anthropic"));
    assert!(!listed("openrouter"));
}
|
||||
|
||||
// Seed `n` evaluated deepseek-chat tasks, `n_success` of them successful.
|
||||
fn seed_deepseek_history(state: &SharedState, n: u64, n_success: u64) {
|
||||
let store = state.store.lock().unwrap();
|
||||
for i in 0..n {
|
||||
let task = store.create_task("t", None).unwrap();
|
||||
let quality = if i < n_success { 0.9 } else { 0.3 };
|
||||
store
|
||||
.set_task_cost(
|
||||
&task.id,
|
||||
&colibri_ledger::TaskCost {
|
||||
provider: Some("deepseek".to_string()),
|
||||
model: Some("deepseek-chat".to_string()),
|
||||
input_tokens: 0,
|
||||
output_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
cache_write_tokens: 0,
|
||||
cost: 0.003,
|
||||
success: quality >= 0.7,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
store
|
||||
.write_task_eval(&colibri_ledger::TaskEval {
|
||||
task_id: task.id.clone(),
|
||||
agent_id: None,
|
||||
eval_mode: "agent".to_string(),
|
||||
completion_status: if quality >= 0.7 { "success" } else { "fail" }.to_string(),
|
||||
quality_score: Some(quality),
|
||||
correctness_check: if quality >= 0.7 { "pass" } else { "fail" }.to_string(),
|
||||
eval_provider: Some("test".to_string()),
|
||||
eval_latency_ms: None,
|
||||
eval_cost_usd: 0.0,
|
||||
evaluated_at: Utc::now().to_rfc3339(),
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn recommend_model_disabled_returns_none() {
    // Plenty of good history and a usable key, but the feature flag is off.
    let config = {
        let mut cfg = test_config();
        cfg.model_selection_enabled = false;
        cfg.deepseek_api_key = Some("k".to_string());
        cfg
    };
    let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
    seed_deepseek_history(&state, 10, 9);

    let pick = recommend_model(&state);
    assert!(pick.is_none(), "disabled → no pick");
}
|
||||
|
||||
#[test]
fn recommend_model_cold_start_returns_none() {
    // Feature enabled, but only two evaluated tasks exist.
    let config = {
        let mut cfg = test_config();
        cfg.model_selection_enabled = true;
        cfg.deepseek_api_key = Some("k".to_string());
        cfg
    };
    let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
    seed_deepseek_history(&state, 2, 2); // below MIN_EVAL_SAMPLES

    let pick = recommend_model(&state);
    assert!(pick.is_none(), "thin data → keep default");
}
|
||||
|
||||
#[test]
fn recommend_model_enabled_with_history_picks() {
    // Feature enabled, key present, and ten evaluated tasks (nine successful).
    let config = {
        let mut cfg = test_config();
        cfg.model_selection_enabled = true;
        cfg.deepseek_api_key = Some("k".to_string());
        cfg
    };
    let state: SharedState = Arc::new(crate::daemon::DaemonState::new(config));
    seed_deepseek_history(&state, 10, 9);

    let picked = recommend_model(&state);
    let choice = picked.expect("enabled + history → pick");
    assert_eq!(choice.model, "deepseek-chat");
    assert_eq!(choice.provider.as_deref(), Some("deepseek"));
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -614,6 +614,23 @@ pub async fn autospawn_agent_if_configured(state: &SharedState) {
|
|||
debug!("autospawn: clawdie-system-probe not found at {probe_binary}; skipping hw profile");
|
||||
}
|
||||
|
||||
// Eval-driven model selection (Phase 3): when enabled and there's a
|
||||
// confident pick from eval history, inject it as the agent's model so its
|
||||
// cloud calls use the best-performing model. Gated off by default — when
|
||||
// off, recommend_model returns None and the agent keeps its own default.
|
||||
if let Some(choice) = crate::scheduler::recommend_model(state) {
|
||||
info!(
|
||||
provider = ?choice.provider,
|
||||
model = %choice.model,
|
||||
rationale = %choice.rationale,
|
||||
"model-selection: injecting eval-selected model into autospawn",
|
||||
);
|
||||
extra_env.insert("COLIBRI_MODEL".to_string(), choice.model.clone());
|
||||
if choice.provider.as_deref() == Some("deepseek") {
|
||||
extra_env.insert("DEEPSEEK_MODEL".to_string(), choice.model);
|
||||
}
|
||||
}
|
||||
|
||||
// Capture the hw profile before extra_env moves into cmd_spawn_agent.
|
||||
let hw_profile_for_mother = extra_env.get("CLAWDIE_HW_PROFILE").cloned();
|
||||
|
||||
|
|
@ -1531,6 +1548,10 @@ mod tests {
|
|||
terminal_watch_targets: Vec::new(),
|
||||
telegram_bot_token: None,
|
||||
telegram_chat_id: None,
|
||||
model_selection_enabled: false,
|
||||
model_selection_window_hours: 168,
|
||||
model_selection_weight_success: 0.8,
|
||||
model_selection_weight_cost: 0.2,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue