feat: rework multi-agent plan + complete CLI surface (19/19 commands) #138

Merged
clawdie merged 3 commits from feat/multi-agent-plan-rework-cli-surface into main 2026-06-21 18:47:04 +02:00
4 changed files with 343 additions and 69 deletions

View file

@ -34,6 +34,17 @@ enum Command {
ListTasks {
status: Option<String>,
},
ClaimTask {
task_id: String,
agent_id: String,
},
TransitionTask {
task_id: String,
status: String,
},
SetCostMode {
mode: String,
},
CreateTask {
title: String,
description: Option<String>,
@ -77,6 +88,9 @@ fn usage() -> &'static str {
colibri [--socket PATH] get-session SESSION_ID
colibri [--socket PATH] compact-session SESSION_ID
colibri [--socket PATH] list-tasks [--status STATUS]
colibri [--socket PATH] claim-task --task-id UUID --agent-id UUID
colibri [--socket PATH] transition-task --task-id UUID --status STATUS
colibri [--socket PATH] set-cost-mode MODE
colibri [--socket PATH] create-task --title TEXT [--description TEXT]
colibri [--socket PATH] intake-task --title TEXT [--description TEXT] [--capability CAP]...
colibri [--socket PATH] list-skills
@ -93,6 +107,9 @@ Examples:
colibri create-task --title "verify OSA check" --description "manual follow-up"
colibri intake-task --title "triage watchdog" --capability freebsd
colibri list-tasks --status queued
colibri claim-task --task-id 550e8400-e29b-41d4-a716-446655440000 --agent-id 660e8400-e29b-41d4-a716-446655440000
colibri transition-task --task-id 550e8400-e29b-41d4-a716-446655440000 --status done
colibri set-cost-mode smart
colibri register-skill freebsd-check --description "Live USB startup check" --category freebsd
colibri list-skills
colibri register-agent NAME [--capability CAP]... [--capabilities CSV]
@ -183,6 +200,17 @@ where
session_id: args[1].clone(),
}),
"list-tasks" => parse_list_tasks_options(&args[1..]),
"claim-task" => parse_claim_task_options(&args[1..]),
"transition-task" => parse_transition_task_options(&args[1..]),
"set-cost-mode" => {
if args.len() != 2 {
Err("set-cost-mode requires MODE\n\n".to_string() + usage())
} else {
Ok(Command::SetCostMode {
mode: args[1].clone(),
})
}
}
"create-task" => {
let (title, description) = parse_task_text_options("create-task", &args[1..])?;
Ok(Command::CreateTask { title, description })
@ -275,6 +303,71 @@ fn parse_list_tasks_options(args: &[String]) -> Result<Command, String> {
Ok(Command::ListTasks { status })
}
/// Parses the arguments that follow the `claim-task` subcommand.
///
/// Recognizes `--task-id VALUE` and `--agent-id VALUE` in any order. The
/// values are stored verbatim (no format validation happens here), and a
/// repeated flag overwrites the earlier value.
///
/// # Errors
///
/// Returns a message followed by the usage text when a flag is the last
/// argument and therefore has no value, when an unrecognized argument is
/// encountered, or when either of the two flags was never supplied.
fn parse_claim_task_options(args: &[String]) -> Result<Command, String> {
    let mut task_id: Option<String> = None;
    let mut agent_id: Option<String> = None;

    // Each flag consumes the argument that follows it, so the iterator is
    // advanced manually inside the loop body.
    let mut remaining = args.iter();
    while let Some(flag) = remaining.next() {
        match flag.as_str() {
            "--task-id" => match remaining.next() {
                Some(value) => task_id = Some(value.clone()),
                None => return Err("--task-id requires UUID\n\n".to_string() + usage()),
            },
            "--agent-id" => match remaining.next() {
                Some(value) => agent_id = Some(value.clone()),
                None => return Err("--agent-id requires UUID\n\n".to_string() + usage()),
            },
            other => return Err(format!("unknown claim-task option: {other}\n\n{}", usage())),
        }
    }

    // A missing --task-id is reported before a missing --agent-id.
    match (task_id, agent_id) {
        (None, _) => Err(format!("claim-task requires --task-id UUID\n\n{}", usage())),
        (Some(_), None) => Err(format!("claim-task requires --agent-id UUID\n\n{}", usage())),
        (Some(task_id), Some(agent_id)) => Ok(Command::ClaimTask { task_id, agent_id }),
    }
}
/// Parses the arguments that follow the `transition-task` subcommand.
///
/// Recognizes `--task-id VALUE` and `--status VALUE` in any order. The
/// values are stored verbatim (the status string is not checked here), and a
/// repeated flag overwrites the earlier value.
///
/// # Errors
///
/// Returns a message followed by the usage text when a flag is the last
/// argument and therefore has no value, when an unrecognized argument is
/// encountered, or when either of the two flags was never supplied.
fn parse_transition_task_options(args: &[String]) -> Result<Command, String> {
    let mut task_id: Option<String> = None;
    let mut status: Option<String> = None;

    // Each flag consumes the argument that follows it, so the iterator is
    // advanced manually inside the loop body.
    let mut remaining = args.iter();
    while let Some(flag) = remaining.next() {
        match flag.as_str() {
            "--task-id" => match remaining.next() {
                Some(value) => task_id = Some(value.clone()),
                None => return Err("--task-id requires UUID\n\n".to_string() + usage()),
            },
            "--status" => match remaining.next() {
                Some(value) => status = Some(value.clone()),
                None => return Err("--status requires STATUS\n\n".to_string() + usage()),
            },
            other => {
                return Err(format!(
                    "unknown transition-task option: {other}\n\n{}",
                    usage()
                ))
            }
        }
    }

    // A missing --task-id is reported before a missing --status.
    match (task_id, status) {
        (None, _) => Err(format!(
            "transition-task requires --task-id UUID\n\n{}",
            usage()
        )),
        (Some(_), None) => Err(format!(
            "transition-task requires --status STATUS\n\n{}",
            usage()
        )),
        (Some(task_id), Some(status)) => Ok(Command::TransitionTask { task_id, status }),
    }
}
fn parse_task_text_options(
command: &str,
args: &[String],
@ -517,6 +610,13 @@ async fn run(options: Options) -> Result<(), ClientError> {
print_json(&client.compact_session(session_id).await?)
}
Command::ListTasks { status } => print_json(&client.list_tasks(status).await?),
Command::ClaimTask { task_id, agent_id } => {
print_json(&client.claim_task(task_id, agent_id).await?)
}
Command::TransitionTask { task_id, status } => {
print_json(&client.transition_task(task_id, status).await?)
}
Command::SetCostMode { mode } => print_json(&client.set_cost_mode(mode).await?),
Command::CreateTask { title, description } => {
print_json(&client.create_task(title, description).await?)
}
@ -671,6 +771,65 @@ mod tests {
);
}
#[test]
fn parses_claim_task() {
    // Both flags supplied: expect a ClaimTask command on the default socket.
    let argv = ["claim-task", "--task-id", "task-1", "--agent-id", "agent-1"];
    let actual = parsed(&argv);

    let expected = Options {
        socket_path: default_socket_path(),
        command: Command::ClaimTask {
            task_id: "task-1".to_string(),
            agent_id: "agent-1".to_string(),
        },
    };
    assert_eq!(actual, expected);
}
#[test]
fn rejects_claim_task_missing_flags() {
    // Only --task-id is given, so the parser must report the absent --agent-id.
    let outcome = parse_args(["claim-task", "--task-id", "only"]);
    let message = outcome.unwrap_err();
    assert!(message.contains("claim-task requires --agent-id"));
}
#[test]
fn parses_transition_task() {
    // Both flags supplied: expect a TransitionTask command on the default socket.
    let argv = ["transition-task", "--task-id", "task-1", "--status", "done"];
    let actual = parsed(&argv);

    let expected = Options {
        socket_path: default_socket_path(),
        command: Command::TransitionTask {
            task_id: "task-1".to_string(),
            status: "done".to_string(),
        },
    };
    assert_eq!(actual, expected);
}
#[test]
fn rejects_transition_task_missing_flags() {
    // Only --task-id is given, so the parser must report the absent --status.
    let outcome = parse_args(["transition-task", "--task-id", "only"]);
    let message = outcome.unwrap_err();
    assert!(message.contains("transition-task requires --status"));
}
#[test]
fn parses_set_cost_mode() {
    // The single positional argument becomes the mode string, unchanged.
    let argv = ["set-cost-mode", "smart"];
    let actual = parsed(&argv);

    let expected = Options {
        socket_path: default_socket_path(),
        command: Command::SetCostMode {
            mode: "smart".to_string(),
        },
    };
    assert_eq!(actual, expected);
}
#[test]
fn rejects_set_cost_mode_without_arg() {
    // No MODE after the subcommand name must be rejected with a clear message.
    let outcome = parse_args(["set-cost-mode"]);
    let message = outcome.unwrap_err();
    assert!(message.contains("set-cost-mode requires MODE"));
}
#[test]
fn rejects_create_task_without_title() {
let err = parse_args(["create-task", "--description", "missing title"]).unwrap_err();

View file

@ -196,6 +196,38 @@ impl DaemonClient {
.await
}
/// Issues a `ColibriCommand::ClaimTask` request for the given task and agent.
///
/// Both identifiers are converted into owned strings and placed in the
/// command unchanged; the reply is whatever `request` returns.
///
/// # Errors
///
/// Propagates the [`ClientError`] returned by `request`.
pub async fn claim_task(
    &self,
    task_id: impl Into<String>,
    agent_id: impl Into<String>,
) -> Result<serde_json::Value, ClientError> {
    let task_id: String = task_id.into();
    let agent_id: String = agent_id.into();
    let command = ColibriCommand::ClaimTask { task_id, agent_id };
    self.request(&command).await
}
/// Issues a `ColibriCommand::TransitionTask` request moving a task to `status`.
///
/// Both arguments are converted into owned strings and placed in the
/// command unchanged; the reply is whatever `request` returns.
///
/// # Errors
///
/// Propagates the [`ClientError`] returned by `request`.
pub async fn transition_task(
    &self,
    task_id: impl Into<String>,
    status: impl Into<String>,
) -> Result<serde_json::Value, ClientError> {
    let task_id: String = task_id.into();
    let status: String = status.into();
    let command = ColibriCommand::TransitionTask { task_id, status };
    self.request(&command).await
}
/// Issues a `ColibriCommand::SetCostMode` request carrying `mode`.
///
/// The mode is converted into an owned string and placed in the command
/// unchanged; the reply is whatever `request` returns.
///
/// # Errors
///
/// Propagates the [`ClientError`] returned by `request`.
pub async fn set_cost_mode(
    &self,
    mode: impl Into<String>,
) -> Result<serde_json::Value, ClientError> {
    let mode: String = mode.into();
    let command = ColibriCommand::SetCostMode { mode };
    self.request(&command).await
}
/// Issues a `ColibriCommand::ListSkills` request and returns the reply from `request`.
///
/// # Errors
///
/// Propagates the [`ClientError`] returned by `request`.
pub async fn list_skills(&self) -> Result<serde_json::Value, ClientError> {
    let command = ColibriCommand::ListSkills;
    self.request(&command).await
}

View file

@ -529,6 +529,76 @@ mod tests {
assert_eq!(pick_agent(&[], &agents).unwrap().name, "generalist");
}
#[test]
fn test_pick_agent_tie_breaking() {
    // Two idle agents advertise the same single capability; only their
    // position in the slice (and id/name/created_at) tells them apart.
    let earlier = colibri_store::Agent {
        id: "a1".into(),
        name: "first-rust".into(),
        capabilities: serde_json::json!(["rust"]),
        status: "idle".into(),
        created_at: "2026-01-01T00:00:00Z".into(),
    };
    let later = colibri_store::Agent {
        id: "a2".into(),
        name: "second-rust".into(),
        capabilities: serde_json::json!(["rust"]),
        status: "idle".into(),
        created_at: "2026-01-02T00:00:00Z".into(),
    };
    let agents = vec![earlier, later];
    let required = vec![String::from("rust")];

    let winner = pick_agent(&required, &agents).unwrap();
    assert_eq!(
        winner.name, "second-rust",
        "tie-break must favor later-in-slice (deterministic)"
    );
}
#[test]
fn test_pick_agent_multiple_required_capabilities() {
    // One agent covers a single required capability, the other covers both.
    let partial = colibri_store::Agent {
        id: "a1".into(),
        name: "rust-only".into(),
        capabilities: serde_json::json!(["rust"]),
        status: "idle".into(),
        created_at: "2026-01-01T00:00:00Z".into(),
    };
    let complete = colibri_store::Agent {
        id: "a2".into(),
        name: "rust-and-freebsd".into(),
        capabilities: serde_json::json!(["rust", "freebsd"]),
        status: "idle".into(),
        created_at: "2026-01-01T00:00:00Z".into(),
    };
    let agents = vec![partial, complete];
    let required = vec![String::from("rust"), String::from("freebsd")];

    let winner = pick_agent(&required, &agents).unwrap();
    assert_eq!(
        winner.name, "rust-and-freebsd",
        "agent with both required caps beats agent with one"
    );
}
#[test]
fn test_pick_agent_active_status_eligible() {
    // The only candidate has status "active"; it must still be selectable.
    let sole_candidate = colibri_store::Agent {
        id: "a1".into(),
        name: "active-bot".into(),
        capabilities: serde_json::json!(["rust"]),
        status: "active".into(),
        created_at: "2026-01-01T00:00:00Z".into(),
    };
    let agents = vec![sole_candidate];
    let required = vec![String::from("rust")];

    let Some(chosen) = pick_agent(&required, &agents) else {
        panic!("status 'active' must be eligible same as 'idle'");
    };
    assert_eq!(chosen.name, "active-bot");
}
#[tokio::test]
async fn test_scheduler_tick_drains_intake_without_deadlock() {
let config = test_config();

View file

@ -1,24 +1,25 @@
# Multi-Agent Multi-Host — Gap Analysis & Implementation Plan
**Created:** 2026-06-19 (Sam & Hermes)
**Status:** Phase 1-2 ready for implementation
**Updated:** 2026-06-21 (Sam & Claude) — reflects 0.11.0 release and narrowed gaps
**Status:** Phase 2a complete; Phase 1 + Phase 2b ready for implementation
## Context
The 0.10.0 milestone (ISO build, rc.d lifecycle, SIGTERM/socket fixes, release
gate) is staged for the FreeBSD build host. The next milestone is proving the
multi-agent, multi-host coordination model: multiple agents on different hosts
reading from the same Colibri task board, each picking up work by capability,
and reporting results back.
Colibri 0.11.0 is released (MIT license, 230 tests, FreeBSD port + CI running).
The tenant/vault provision chain has landed (`register-tenant` → jail spawn →
`provision_tenant_env()` → `colibri-vault::provision`). The next milestone is
proving the multi-agent, multi-host coordination model: multiple agents on
different hosts reading from the same Colibri task board, each picking up work
by capability, and reporting results back.
PR #83 landed the first cross-host plumbing — a socat TCP bridge, Python polling
scripts, and a Hermes cronjob configuration. But the gap analysis below shows
that **the multi-host plane is packaged and documented but almost entirely
untested**. This document defines what needs to happen to close that gap.
scripts, and a Hermes cronjob configuration. The gap analysis below defines what
remains to close the multi-host testing gap.
---
## Current architecture (as of PR #83)
## Current architecture (as of 0.11.0)
The multi-host stack lives **outside the Rust daemon**:
@ -34,17 +35,17 @@ The multi-host stack lives **outside the Rust daemon**:
- **Transport:** `tokio::net::UnixListener` only — zero TCP in Rust. The socat
bridge is a shell-level relay.
- **Agent model:** `register-agent` stores name + capabilities + status
(`active`/`idle`/`offline`). No `host` field, no `last_seen`, no heartbeat,
no lease/TTL.
(`active`/`idle`/`offline`). Awaiting `host` field, `last_seen`, heartbeat,
and lease/TTL (Phase 3).
- **Task assignment:** `pick_agent()` matches by capability score (partial
match counts, highest score wins, tie → later-in-slice). `claim_task()` is a
blind UPDATE with no concurrency guard.
blind UPDATE; await a concurrency guard (Gap 4).
- **Polling:** `colibri_poll.py` queries `list-tasks status=started` filtered
by `agent_id`. `colibri_task_done.py` calls `transition-task`.
- **Spawning:** `poll_tasks()` in daemon.rs spawns agents for `Claimed` tasks,
skipping those with an existing session (idempotency guard).
### Socket command inventory (17 commands, all Unix-socket)
### Socket command inventory (19 commands, all Unix-socket)
| Category | Commands |
| -------------- | --------------------------------------------------------------------------- |
@ -53,14 +54,14 @@ The multi-host stack lives **outside the Rust daemon**:
| Agent process | `spawn-agent`, `kill-agent` |
| Board | `list-tasks`, `create-task`, `transition-task`, `claim-task`, `intake-task` |
| Agent registry | `register-agent`, `list-agents` |
| Tenant | `register-tenant`, `list-tenants` |
| Skills | `list-skills`, `register-skill` |
### CLI surface (10 of 17 commands exposed)
### CLI surface (16 of 19 commands exposed)
Missing from CLI: `claim-task`, `transition-task`, `register-agent`,
`list-agents`, `set-cost-mode`, `register-skill` (register-skill IS in CLI;
the others are socket-only). Remote agents currently must use raw Python
socket calls.
Awaiting CLI exposure: `claim-task`, `transition-task`, `set-cost-mode`
(Phase 2b). Remote agents currently use raw Python socket calls for these
three commands.
---
@ -73,21 +74,38 @@ socket calls.
- SIGTERM cleanup + stale socket safety
- Session isolation with 2 agents (bypasses task board)
- Cost mode derivation in background rotation
- `pick_agent` unit tests: best match, offline exclusion, no-match, empty-required
- Scheduler tick drains intake queue
- `pick_agent` unit tests: best match (2 agents), offline exclusion, no-match,
empty-required, partial scoring, none scoring
- Scheduler tick drains intake queue without deadlock
- `poll_tasks` spawns agent for a claimed task
- Double-spawn session isolation
- Tenant register + list over socket
### What is NOT tested
### Test targets (awaiting coverage)
| # | Gap | Severity | Linux-doable? |
| --- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------------------------------- |
| 1 | **Multi-agent task-board contention** — `pick_agent` only tested with 0-1 agents; no capability-based multi-agent assignment test; no same-agent-multiple-tasks test | High | Yes |
| 2 | **CLI surface gaps** — `claim-task`, `transition-task`, `register-agent`, `list-agents` have no CLI; remote agents forced to use raw Python | Medium | Yes |
| 3 | **Agent presence model** — missing `host`, `last_seen`, and heartbeat/lease columns; add these schema fields to detect stale remote agents | High | Yes (schema change) |
| 4 | **Remote-safe task claim** — `claim_task` is a blind UPDATE, no concurrency safety, no lease/TTL | Medium | Yes |
| 5 | **Python polling scripts** — `colibri_poll.py` and `colibri_task_done.py` have zero test coverage | Medium | Yes |
| 6 | **TCP bridge round-trip** — socat bridge untested end-to-end | Medium | Partial (needs socat or FreeBSD) |
| 7 | **Cross-host coordination** — no test simulates a remote agent claiming/transitioning a task over the bridge | High | FreeBSD only |
| # | Gap | Severity | Linux-doable? |
| --- | ------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------------------------------- |
| 1 | **Multi-agent task-board contention** — `pick_agent` tie-breaking, multi-required-capability, and active-status eligibility await dedicated tests | High | Yes |
| 2 | **CLI surface gaps** — `claim-task`, `transition-task`, `set-cost-mode` await CLI exposure (Phase 2b) | Medium | Yes |
| 3 | **Agent presence model** — await `host`, `last_seen`, and heartbeat/lease columns to detect stale remote agents (Phase 3) | High | Yes (schema change) |
| 4 | **Remote-safe task claim** — `claim_task` is a blind UPDATE; await a concurrency guard or lease/TTL | Medium | Yes |
| 5 | **Python polling scripts** — `colibri_poll.py` and `colibri_task_done.py` have zero test coverage | Medium | Yes |
| 6 | **TCP bridge round-trip** — socat bridge untested end-to-end | Medium | Partial (needs socat or FreeBSD) |
| 7 | **Cross-host coordination** — await a test simulating a remote agent claiming/transitioning a task over the bridge | High | FreeBSD only |
### Closed gaps (since the original 2026-06-19 analysis)
- **CLI: register-agent + list-agents** — merged (Phase 2a, PR #107)
- **CLI: register-tenant + list-tenants + register-skill** — merged
- **pick_agent scoring** — partial-match and no-match scoring tests added
- **Tenant/vault provision chain** — register-tenant, jail spawn flags,
`provision_tenant_env()`, `colibri-vault::provision` all landed
- **Issue #88** (CollectionNotFound) — daemon passes `tenant_id` (collection
name) to `vault::provision`
- **Issue #91** (tenant provision target verification) — `trim_trailing_slash`
string-equality check
- **Issue #92** (vault provision canonicalization) — canonicalize +
allowed-root containment (PR #119)
---
@ -98,14 +116,14 @@ socket calls.
#### 1a. Pure `pick_agent` unit tests — extend `scheduler.rs` test module
Existing tests cover: best match (2 agents, different caps), offline exclusion,
no-match, empty-required. Add:
no-match, empty-required, partial scoring, none scoring, tick-drains-intake.
Add:
| Test | What it proves |
| -------------------------------------------------- | ---------------------------------------------------------------------------------------- |
| `test_pick_agent_partial_match_wins_over_no_match` | Agent with `["rust","freebsd"]` beats agent with `["python"]` for required `["freebsd"]` |
| `test_pick_agent_tie_breaking` | Two agents with same score — verify deterministic tie-break (later name wins) |
| `test_pick_agent_multiple_required_capabilities` | Required `["rust","freebsd"]` — agent with both beats agent with one |
| `test_pick_agent_active_status_eligible` | `status: "active"` is treated same as `"idle"` (both eligible) |
| Test | What it proves |
| ------------------------------------------------ | --------------------------------------------------------------------------------- |
| `test_pick_agent_tie_breaking` | Two agents with same score — verify deterministic tie-break (later-in-slice wins) |
| `test_pick_agent_multiple_required_capabilities` | Required `["rust","freebsd"]` — agent with both beats agent with one |
| `test_pick_agent_active_status_eligible` | `status: "active"` is treated same as `"idle"` (both eligible) |
#### 1b. Multi-agent board integration test — new file `crates/colibri-daemon/tests/multi_agent_board.rs`
@ -145,36 +163,31 @@ Documents the current contention behavior (no guard against same agent getting
multiple tasks) and proves session isolation when one agent handles multiple
tasks.
### Phase 2: Merge `feat/cli-register-agent` + add claim/transition CLI
### Phase 2: CLI surface completion
#### 2a. Merge `feat/cli-register-agent` (existing branch, 64 lines, client-only)
#### 2a. Merge `feat/cli-register-agent` — COMPLETE
The branch is clean and ready:
`register-agent` and `list-agents` are in the CLI (merged via PR #107).
- `Command::RegisterAgent { name, capabilities }` + `Command::ListAgents`
- `parse_capabilities()` helper (reuses `--capability`/`--capabilities` pattern)
- `DaemonClient::register_agent()` + `DaemonClient::list_agents()`
- Usage text
#### 2b. Add `claim-task`, `transition-task`, and `set-cost-mode` to CLI
Enables: `colibri register-agent osa-agent --capability freebsd` and
`colibri list-agents`.
#### 2b. Add `claim-task` and `transition-task` to CLI
The two commands `colibri_task_done.py` currently does via raw socket. Adding
them to the CLI means remote agents can work entirely through the `colibri`
binary:
The three commands `colibri_task_done.py` currently reaches via raw socket.
Adding them to the CLI means remote agents can work entirely through the
`colibri` binary:
```
colibri claim-task --task-id <UUID> --agent-id <UUID>
colibri transition-task --task-id <UUID> --status done|failed
colibri set-cost-mode MODE
```
Implementation:
- Add `Command::ClaimTask { task_id, agent_id }` and
`Command::TransitionTask { task_id, status }` variants
- Add `DaemonClient::claim_task()` and `DaemonClient::transition_task()`
- Add `Command::ClaimTask { task_id, agent_id }`,
`Command::TransitionTask { task_id, status }`, and
`Command::SetCostMode { mode }` variants
- Add `DaemonClient::claim_task()`, `DaemonClient::transition_task()`, and
`DaemonClient::set_cost_mode()` methods
- Add CLI parsing (follow existing `--flag value` pattern)
#### 2c. Add CLI unit tests for new commands
@ -197,7 +210,7 @@ simulating what `colibri_poll.py` does. Register two agents, create tasks with
different capabilities, verify each agent sees only its tasks via the poll
path, transition tasks to done.
**Deferred** — depends on Phase 2 CLI additions (so the test can use CLI
**Deferred** — depends on Phase 2b CLI additions (so the test can use CLI
commands instead of raw socket replication of the Python scripts).
### Phase 5: Bridge validation (FreeBSD-only)
@ -211,17 +224,17 @@ mesh.**
## Summary
| Phase | What | Files | Linux? | Status |
| ----- | ---------------------------------------- | ------------------------------------ | ------ | ------------------------ |
| 1a | `pick_agent` unit tests | `scheduler.rs` tests | Yes | Ready |
| 1b | Multi-agent board integration test | `tests/multi_agent_board.rs` (new) | Yes | Ready |
| 1c | Same-capability multi-task test | Same file | Yes | Ready |
| 2a | Merge `feat/cli-register-agent` | `colibri.rs` + `lib.rs` | Yes | Branch exists |
| 2b | Add `claim-task` + `transition-task` CLI | `colibri.rs` + `lib.rs` | Yes | Ready |
| 2c | CLI parse tests | `colibri.rs` tests | Yes | Ready |
| 3 | Agent presence schema | `schema.rs` + `lib.rs` + `socket.rs` | Yes | Deferred |
| 4 | Polling workflow test | `tests/` | Yes | Deferred (needs Phase 2) |
| 5 | TCP bridge validation | FreeBSD host | No | FreeBSD lane |
| Phase | What | Files | Linux? | Status |
| ----- | ---------------------------------------------------------- | ------------------------------------ | ------ | ------------------------- |
| 1a | `pick_agent` unit tests (3 remaining) | `scheduler.rs` tests | Yes | Ready |
| 1b | Multi-agent board integration test | `tests/multi_agent_board.rs` (new) | Yes | Ready |
| 1c | Same-capability multi-task test | Same file | Yes | Ready |
| 2a | Merge `feat/cli-register-agent` | `colibri.rs` + `lib.rs` | Yes | **Complete** |
| 2b | Add `claim-task` + `transition-task` + `set-cost-mode` CLI | `colibri.rs` + `lib.rs` | Yes | Ready |
| 2c | CLI parse tests | `colibri.rs` tests | Yes | Ready |
| 3 | Agent presence schema | `schema.rs` + `lib.rs` + `socket.rs` | Yes | Deferred |
| 4 | Polling workflow test | `tests/` | Yes | Deferred (needs Phase 2b) |
| 5 | TCP bridge validation | FreeBSD host | No | FreeBSD lane |
**Immediate scope:** Phases 1-2. All testable on Linux with `cargo test` +
**Immediate scope:** Phases 1 + 2b. All testable on Linux with `cargo test` +
`cargo clippy` gate. No FreeBSD dependency for implementation.