feat: scaffold colibri-skills crate — split-brain read consumer #2

Merged
clawdie merged 3 commits from feat/colibri-skills-scaffold into main 2026-05-31 16:03:13 +02:00
5 changed files with 528 additions and 168 deletions

12
Cargo.lock generated
View file

@ -180,7 +180,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
dependencies = [
"iana-time-zone",
"js-sys",
"num-traits",
"serde",
"wasm-bindgen",
"windows-link",
]
@ -284,6 +287,15 @@ dependencies = [
"tokio",
]
[[package]]
name = "colibri-skills"
version = "0.0.1"
dependencies = [
"chrono",
"serde",
"serde_json",
]
[[package]]
name = "colibri-store"
version = "0.0.1"

View file

@ -1,5 +1,5 @@
[workspace]
members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store"]
members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store", "crates/colibri-skills"]
[package]
name = "colibri"

View file

@ -0,0 +1,11 @@
[package]
name = "colibri-skills"
version = "0.0.1"
edition = "2021"
description = "Colibri skills read consumer — indexes Clawdie-AI skill artifacts into SQLite"
license = "AGPL-3.0-only"
[dependencies]
serde = { version = "1", features = ["derive"] }
serde_json = "1"
chrono = { version = "0.4", features = ["serde"] }

View file

@ -0,0 +1,317 @@
//! Colibri Skills — read-only consumer for Clawdie-AI skill artifacts.
//!
//! This crate indexes committed, reviewed skill artifacts from the Clawdie-AI
//! repository into SQLite. It does NOT author, edit, or store skill content —
//! that responsibility lives in Clawdie-AI.
//!
//! ```text
//! Clawdie-AI repo (source of truth)
//! docs/astro-howto/
//! docs/forgejo-admin/
//! ...
//!
//! colibri-skills (read-only consumer)
//! reads run_manifest.json
//! validates checksums
//! indexes markdown/transcript chunks
//! serves CLI/TUI search
//! ```
use serde::{Deserialize, Serialize};
// ── Core types ────────────────────────────────────────────────────────────
/// A read-only skill artifact indexed from Clawdie-AI.
///
/// Bundles one skill's parsed manifest, its per-file artifact records, and its
/// lifecycle status into a single value that can be serialized and
/// deserialized with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Skill {
/// Identifier of the skill. Presumably the skill directory name
/// (e.g. "astro-howto") — TODO confirm once the importer exists.
pub skill_id: String,
/// Name intended for display; how it is derived is not defined in this file.
pub display_name: String,
/// Relative path within the Clawdie-AI repo (e.g. "docs/astro-howto").
pub source_path: String,
/// Manifest data for this skill (see [`SkillManifest`]).
pub manifest: SkillManifest,
/// File records belonging to this skill (see [`SkillArtifact`]).
pub artifacts: Vec<SkillArtifact>,
/// Lifecycle status of the skill (see [`SkillStatus`]).
pub status: SkillStatus,
/// Natural-language verification test.
pub verification: Option<String>,
}
/// Parsed from `run_manifest.json` in a skill directory.
///
/// With the derived `Deserialize` impl and no `#[serde(default)]` attributes,
/// every non-`Option` field must be present in the input; `Option` fields
/// become `None` when they are absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SkillManifest {
/// Run identifier recorded in the manifest.
pub run_id: String,
/// Creation time as written in the manifest, kept as an unparsed string.
/// NOTE(review): presumably ISO 8601 — confirm against real manifests.
pub created: String,
/// Source media metadata, when the manifest provides it.
pub source: Option<ManifestSource>,
/// Stages of the skill generation pipeline.
/// NOTE(review): presumably listed in execution order — confirm.
pub pipeline_stages: Vec<PipelineStage>,
/// Models used during skill generation.
pub models_used: Vec<ModelUsage>,
/// Free-form notes, if the manifest contains any.
pub notes: Option<String>,
}
/// Source media metadata from the manifest.
///
/// Every field is optional, so a manifest may supply any subset of them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestSource {
/// Path of the source media as recorded in the manifest.
/// NOTE(review): likely a machine-local provenance path; treat it as
/// metadata only and do not rely on it existing at runtime — confirm.
pub path: Option<String>,
/// Size as a human-readable string; the exact format is not defined here.
pub size_human: Option<String>,
/// Codec description of the source media.
pub codec: Option<String>,
/// Resolution of the source media; the string format is not defined here.
pub resolution: Option<String>,
/// Duration as a human-readable string; the exact format is not defined here.
pub duration_human: Option<String>,
/// Flag recorded by the manifest. Presumably states that the original media
/// file was left unmodified — TODO confirm.
pub original_untouched: Option<bool>,
}
/// A single stage in the skill generation pipeline.
///
/// `stage` and `tool` are required when deserializing; the remaining fields
/// are optional and become `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineStage {
/// Name of the stage.
pub stage: String,
/// Tool that performed the stage.
pub tool: String,
/// Model used by the stage, if any.
pub model: Option<String>,
/// Output of the stage. Presumably a file path — TODO confirm.
pub output: Option<String>,
/// Duration of the stage. Presumably wall-clock seconds — TODO confirm units.
pub duration_s: Option<f64>,
/// Line count reported for the stage. Presumably the number of lines in
/// `output` — TODO confirm.
pub lines: Option<u64>,
/// Size as a human-readable string; the exact format is not defined here.
pub size_human: Option<String>,
}
/// A model used during skill generation.
///
/// All three fields are required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelUsage {
/// Name of the model.
pub model: String,
/// What the model was used for.
pub purpose: String,
/// Whether an API key was used for this model, as recorded in the manifest.
pub api_key_used: bool,
}
/// An individual file within a skill directory.
///
/// Holds metadata about the file only (path, size, hash); the file content
/// itself is not part of this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SkillArtifact {
/// Classification of the file (see [`ArtifactType`]).
pub artifact_type: ArtifactType,
/// Path relative to the skill directory.
pub relative_path: String,
/// File name. Presumably the final component of `relative_path` —
/// TODO confirm.
pub file_name: String,
/// MIME type of the file, when known.
pub mime_type: Option<String>,
/// File size in bytes.
pub size_bytes: u64,
/// SHA-256 hash of the file.
/// NOTE(review): the text encoding (presumably lowercase hex) is not fixed
/// by this type — confirm.
pub sha256_hash: String,
}
/// Classification of a skill artifact file.
///
/// Because of `rename_all = "snake_case"`, variant names are serialized in
/// snake_case (e.g. `ContactSheet` becomes `contact_sheet`). The rules that map
/// a path to a variant live in `ArtifactType::from_path`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ArtifactType {
/// Markdown document.
Document,
/// Image file.
Image,
/// Script file.
Script,
/// Transcript text file.
Transcript,
/// Run manifest (`run_manifest` JSON file).
Manifest,
/// Checksum listing.
Checksum,
/// JSON report.
Report,
/// Contact sheet.
ContactSheet,
/// Anything not matched by another variant; `from_path` stores the
/// original, unmodified path in the payload.
Other(String),
}
impl ArtifactType {
    /// Classifies an artifact from its path.
    ///
    /// Matching is case-insensitive and purely textual: the file system is
    /// never touched. Substring checks (`contact_sheet`, `run_manifest`,
    /// `sha256`, `checksum`, `report`, `transcript`) are applied to the whole
    /// path, so a matching directory name also triggers them.
    ///
    /// Rules are evaluated in this order; the first match wins:
    ///
    /// 1. `.py`, `.sh`, `.bash` → [`ArtifactType::Script`]
    /// 2. contains `contact_sheet` → [`ArtifactType::ContactSheet`]
    /// 3. contains `run_manifest` and ends with `.json` → [`ArtifactType::Manifest`]
    /// 4. contains `sha256` or `checksum` → [`ArtifactType::Checksum`]
    /// 5. contains `report` and ends with `.json` → [`ArtifactType::Report`]
    /// 6. `.md` → [`ArtifactType::Document`]
    /// 7. `.jpg`, `.jpeg`, `.png`, `.webp` → [`ArtifactType::Image`]
    /// 8. `.txt` and contains `transcript` → [`ArtifactType::Transcript`]
    /// 9. anything else → [`ArtifactType::Other`] carrying the original path
    pub fn from_path(path: &str) -> Self {
        const SCRIPT_EXTENSIONS: &[&str] = &[".py", ".sh", ".bash"];
        // `.jpeg` is listed next to `.jpg`: both name the same format, and
        // leaving one out would file such screenshots under `Other`.
        const IMAGE_EXTENSIONS: &[&str] = &[".jpg", ".jpeg", ".png", ".webp"];

        let lower = path.to_lowercase();
        let has_extension = |extensions: &[&str]| extensions.iter().any(|ext| lower.ends_with(*ext));
        let is_json = lower.ends_with(".json");

        // Specific patterns are checked before generic extensions, so that
        // e.g. `contact_sheet.jpg` is a contact sheet rather than an image and
        // `generate_contact_sheet.py` is still a script.
        if has_extension(SCRIPT_EXTENSIONS) {
            ArtifactType::Script
        } else if lower.contains("contact_sheet") {
            ArtifactType::ContactSheet
        } else if is_json && lower.contains("run_manifest") {
            ArtifactType::Manifest
        } else if lower.contains("sha256") || lower.contains("checksum") {
            ArtifactType::Checksum
        } else if is_json && lower.contains("report") {
            ArtifactType::Report
        } else if lower.ends_with(".md") {
            ArtifactType::Document
        } else if has_extension(IMAGE_EXTENSIONS) {
            ArtifactType::Image
        } else if lower.ends_with(".txt") && lower.contains("transcript") {
            ArtifactType::Transcript
        } else {
            // Keep the caller's original spelling, not the lowercased copy.
            ArtifactType::Other(path.to_string())
        }
    }
}
/// A chunk of searchable content extracted from a skill artifact.
///
/// A chunk is tied to its origin by `skill_id` plus `artifact_relative_path`
/// and records where in that file the content was found.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SkillChunk {
/// Row identifier of the chunk. Presumably `None` until the chunk has been
/// stored — TODO confirm once storage exists.
pub chunk_id: Option<i64>,
/// Identifier of the skill this chunk belongs to.
pub skill_id: String,
/// Path of the artifact the chunk was extracted from. Presumably relative to
/// the skill directory, like `SkillArtifact::relative_path` — TODO confirm.
pub artifact_relative_path: String,
/// Kind of content held by the chunk (see [`ChunkType`]).
pub chunk_type: ChunkType,
/// Nearest markdown heading, if applicable.
pub heading: Option<String>,
/// Text of the chunk.
pub content: String,
/// First line of the chunk within the artifact.
/// NOTE(review): whether numbering is 0- or 1-based is not fixed here.
pub line_start: u64,
/// Last line of the chunk within the artifact.
/// NOTE(review): whether this bound is inclusive is not fixed here.
pub line_end: u64,
/// Estimated token count of `content`, when one was computed.
pub tokens_estimate: Option<u64>,
}
/// Kind of content stored in a [`SkillChunk`] or reported by a
/// [`SearchResult`].
///
/// Because of `rename_all = "snake_case"`, variants are serialized in
/// snake_case (e.g. `MarkdownSection` becomes `markdown_section`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChunkType {
/// Section of a Markdown document.
MarkdownSection,
/// Segment of a transcript.
TranscriptSegment,
/// Block of commands. Presumably shell commands — TODO confirm.
CommandBlock,
/// Block of source code.
CodeBlock,
/// Table.
Table,
}
// ── Status ────────────────────────────────────────────────────────────────
/// Lifecycle status of a [`Skill`].
///
/// Serialized in snake_case (`active`, `archived`, `superseded`).
/// `SkillStatus::default()` returns [`SkillStatus::Active`].
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SkillStatus {
/// The skill is in use; this is the default status.
#[default]
Active,
/// The skill has been archived.
Archived,
/// The skill has been superseded. Presumably by a newer skill —
/// TODO confirm how the replacement is tracked.
Superseded,
}
// ── Import summary ────────────────────────────────────────────────────────
/// Returned after indexing a Clawdie-AI checkout.
///
/// Plain counters plus collected error messages. Only `checksum_failures` and
/// `errors` are consulted by `ImportSummary::success`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportSummary {
/// Number of skills discovered in the checkout.
pub skills_found: usize,
/// Number of skills that were indexed.
pub skills_indexed: usize,
/// Number of skills that were skipped.
pub skills_skipped: usize,
/// Total number of artifacts processed.
pub artifacts_total: usize,
/// Total number of chunks produced.
pub chunks_total: usize,
/// Number of checksum verification failures.
pub checksum_failures: usize,
/// Error messages collected during the import.
pub errors: Vec<String>,
}
impl ImportSummary {
    /// Reports whether the import finished cleanly.
    ///
    /// Returns `true` only when no checksum failure was counted and no error
    /// message was collected. The other counters, including skipped skills,
    /// do not influence the result.
    pub fn success(&self) -> bool {
        let checksums_ok = self.checksum_failures == 0;
        let no_errors = self.errors.is_empty();
        checksums_ok && no_errors
    }
}
// ── Search result ─────────────────────────────────────────────────────────
/// A single hit returned by a skill search.
///
/// Identifies the skill and artifact the hit came from and carries a text
/// snippet for display.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
/// Identifier of the skill the hit belongs to.
pub skill_id: String,
/// Display name of that skill.
pub display_name: String,
/// Kind of chunk that matched (see [`ChunkType`]).
pub chunk_type: ChunkType,
/// Heading associated with the matching chunk, if any.
pub heading: Option<String>,
/// Text excerpt of the match; how it is cut is not defined in this file.
pub snippet: String,
/// Path of the artifact containing the match. Presumably relative to the
/// skill directory — TODO confirm.
pub artifact_path: String,
/// Line at which the matching chunk starts within the artifact.
pub line_start: u64,
}
// ── Tests ─────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    //! Unit tests for artifact classification, the default skill status, and
    //! the `ImportSummary::success` predicate.

    use super::*;

    /// Builds an [`ImportSummary`] with every counter at zero and no errors.
    fn clean_summary() -> ImportSummary {
        ImportSummary {
            skills_found: 0,
            skills_indexed: 0,
            skills_skipped: 0,
            artifacts_total: 0,
            chunks_total: 0,
            checksum_failures: 0,
            errors: vec![],
        }
    }

    // ── ArtifactType::from_path ──────────────────────────────────────────

    #[test]
    fn artifact_type_from_path_document() {
        assert_eq!(
            ArtifactType::from_path("docs/HOWTO.md"),
            ArtifactType::Document
        );
    }

    #[test]
    fn artifact_type_from_path_image() {
        assert_eq!(
            ArtifactType::from_path("screenshots/001_00-01-05_intro.jpg"),
            ArtifactType::Image
        );
    }

    #[test]
    fn artifact_type_from_path_script_python() {
        // The script extension wins even though the name contains
        // "contact_sheet".
        assert_eq!(
            ArtifactType::from_path("scripts/generate_contact_sheet.py"),
            ArtifactType::Script
        );
    }

    #[test]
    fn artifact_type_from_path_script_shell() {
        assert_eq!(
            ArtifactType::from_path("scripts/extract_screenshots.sh"),
            ArtifactType::Script
        );
    }

    #[test]
    fn artifact_type_from_path_transcript() {
        assert_eq!(
            ArtifactType::from_path("transcript_local.txt"),
            ArtifactType::Transcript
        );
    }

    #[test]
    fn artifact_type_from_path_manifest() {
        assert_eq!(
            ArtifactType::from_path("run_manifest.json"),
            ArtifactType::Manifest
        );
    }

    #[test]
    fn artifact_type_from_path_checksum() {
        assert_eq!(
            ArtifactType::from_path("artifacts.sha256"),
            ArtifactType::Checksum
        );
    }

    #[test]
    fn artifact_type_from_path_report() {
        assert_eq!(
            ArtifactType::from_path("contact-sheet/report.json"),
            ArtifactType::Report
        );
    }

    #[test]
    fn artifact_type_from_path_contact_sheet() {
        assert_eq!(
            ArtifactType::from_path("contact-sheet/contact_sheet.jpg"),
            ArtifactType::ContactSheet
        );
    }

    #[test]
    fn artifact_type_from_path_ignores_case() {
        assert_eq!(
            ArtifactType::from_path("Docs/README.MD"),
            ArtifactType::Document
        );
        assert_eq!(
            ArtifactType::from_path("SCREENSHOTS/INTRO.PNG"),
            ArtifactType::Image
        );
    }

    #[test]
    fn artifact_type_from_path_plain_txt_is_other() {
        // A `.txt` file only counts as a transcript when the path says so.
        assert_eq!(
            ArtifactType::from_path("notes.txt"),
            ArtifactType::Other("notes.txt".to_string())
        );
    }

    #[test]
    fn artifact_type_from_path_other_keeps_original_path() {
        // The payload keeps the caller's spelling, not the lowercased copy.
        assert_eq!(
            ArtifactType::from_path("Data/Archive.tar.gz"),
            ArtifactType::Other("Data/Archive.tar.gz".to_string())
        );
    }

    // ── SkillStatus ──────────────────────────────────────────────────────

    #[test]
    fn skill_status_default_is_active() {
        assert_eq!(SkillStatus::default(), SkillStatus::Active);
    }

    // ── ImportSummary::success ───────────────────────────────────────────

    #[test]
    fn import_summary_success_empty() {
        assert!(clean_summary().success());
    }

    #[test]
    fn import_summary_failure_on_error() {
        let s = ImportSummary {
            skills_found: 1,
            skills_skipped: 1,
            errors: vec!["manifest parse error".into()],
            ..clean_summary()
        };
        assert!(!s.success());
    }

    #[test]
    fn import_summary_failure_on_checksum_mismatch() {
        // A checksum failure alone must fail the import, even with no errors.
        let s = ImportSummary {
            skills_found: 1,
            skills_indexed: 1,
            artifacts_total: 1,
            checksum_failures: 1,
            ..clean_summary()
        };
        assert!(!s.success());
    }
}

View file

@ -1,211 +1,231 @@
# Colibri Skills Plan
**Date:** 27.maj.2026
**Status:** PLANNED — docs first, no crate scaffold yet
**Status:** Phase 1 scaffolded — read-only split-brain consumer
**Crate:** `crates/colibri-skills`
## Purpose
Define the next Colibri slice for Clawdie's split-brain model:
`colibri-skills` is Colibri's read-only runtime consumer for reviewed skill
artifacts authored in the Clawdie-AI repo. It does not author, edit, or store
canonical skills. Clawdie-AI remains the source of truth; Colibri indexes and
serves typed/runtime views.
- **built-in knowledge** stays separate from
- **user/agent memory** and
- **operational state**
```text
Clawdie-AI repo (source of truth)
docs/astro-howto/
docs/forgejo-admin/
docs/vaultwarden-onboarding/
...
The product explanation is simple:
Colibri colibri-skills crate (read-only consumer)
reads committed skill artifacts
validates checksums
indexes Markdown/transcript chunks
exposes Skill, SkillArtifact, SkillChunk structs
serves CLI/TUI/search later
```
> Think of it like the machine already ships with its handbook inside.
This keeps the split-brain model explicit:
In Clawdie terms, that handbook is the **committed built-in knowledge
artifact** imported into `system_skills`.
- `system_skills`: committed built-in knowledge / manuals / reviewed skillpacks
- `system_brain`: user and agent memory
- `system_ops`: live runtime, task, service, and daemon state
This plan names the Rust-side consumer for that lane:
## Seed artifact: Astro how-to
- **crate name:** `colibri-skills`
The first concrete skillpack is `docs/astro-howto/` in Clawdie-AI. It is useful
because it is not just prose; it includes transcript, generated how-to docs,
commands, screenshots, contact sheet, manifest, checksums, and scripts.
It is a **read path first**, not a new generator, not a new database, and not a
memory replacement.
```json
{
"skill_id": "astro-howto",
"source": "local video-derived training artifact",
"inputs": [
"transcript_local.txt",
"screenshots/",
"contact-sheet/contact_sheet.jpg"
],
"outputs": [
"docs/HOWTO.md",
"docs/COMMANDS.md",
"docs/SCREENSHOTS.md",
"docs/SUMMARY.md"
],
"verification": "can user create and run an Astro project?",
"media": "screenshots/*.jpg (paths + hashes, not blobs)",
"manifest": "run_manifest.json",
"checksums": "artifacts.sha256"
}
```
## Background
Pipeline shape:
The current split-brain design already exists in `clawdie-ai`:
```text
video → local transcript → topic extraction → how-to/runbook
→ screenshots/contact sheet → commands → verification test
→ manifest + checksums → reviewed skill artifact → Colibri read-only index
```
- `system_skills`
- built-in knowledge / manuals already included
- sourced from `bootstrap/skills-memory/artifact.sql`
- `system_brain`
- dynamic user/agent memory
- `system_ops`
- runtime and operational state
## Ownership
Colibri should reflect that boundary explicitly instead of flattening all
knowledge into one vague "brain."
| Layer | Role | Writes | Reads |
|-------|------|--------|-------|
| Clawdie-AI | Source of truth | Skill artifacts via PR | N/A |
| `colibri-skills` | Runtime consumer | Never to source repo | Indexed skill structs from committed artifacts |
| Agents | Authors/reviewers | Candidate skill artifact PRs | Skill content for task routing |
| `system_brain` | Agent/user memory | Personal/user/agent context | Not canonical skill docs |
| `system_ops` | Runtime state | Live task/service state | Not skills |
## What `colibri-skills` should own
## What `colibri-skills` does
`colibri-skills` should own the Rust-side access layer for **built-in
knowledge only**.
- Read skill manifests from a configured Clawdie-AI checkout path
- Parse `run_manifest.json`
- Validate checksums against `artifacts.sha256`
- Classify artifacts as document, image, script, transcript, manifest, checksum,
report, contact sheet, or other
- Index Markdown/transcript chunks for search
- Expose stable typed structs for daemon/client/TUI callers
- Persist runtime index metadata in SQLite
Phase-1 responsibilities:
## What `colibri-skills` does not do
- connect to the `system_skills` database
- report artifact/import status
- expose stable typed results for built-in knowledge lookup
- support text search over committed handbook content
- return structured summaries that other Colibri crates can consume
- Author, edit, or create skills
- Store image blobs in SQLite; store paths and hashes only
- Replace `system_brain`
- Replace `system_ops`
- Own provider/API budget logic
- Require nonportable local source media paths at runtime
Good callers later:
## Phase 1 delivered
- `colibri-daemon`
- `colibri-client`
- `colibri-glasspane-tui`
The scaffold crate now provides:
## What `colibri-skills` must not own
- `Skill`
- `SkillManifest`
- `SkillArtifact`
- `SkillChunk`
- `ArtifactType`
- `SkillStatus`
- `ImportSummary`
- `SearchResult`
- unit tests for artifact classification and status/summary behavior
It should **not** own:
Phase 1 is intentionally scaffold-only: compile and type proof, no runtime
import behavior yet.
- user memory retrieval from `system_brain`
- task/session/runtime state from `system_ops`
- artifact generation or embedding refresh
- OpenRouter or embedding-provider budget logic
- ISO build-time content packing
## SQLite schema target
Those already belong elsewhere.
```sql
CREATE TABLE system_skills (
skill_id TEXT PRIMARY KEY,
display_name TEXT NOT NULL,
source_path TEXT NOT NULL, -- relative within Clawdie-AI repo
manifest_hash TEXT, -- sha256 of run_manifest.json
created_at TEXT NOT NULL, -- ISO 8601
updated_at TEXT NOT NULL,
verification TEXT, -- natural-language verification test
status TEXT NOT NULL DEFAULT 'active' -- active, archived, superseded
);
## Why a separate crate
CREATE TABLE system_skill_artifacts (
artifact_id INTEGER PRIMARY KEY AUTOINCREMENT,
skill_id TEXT NOT NULL REFERENCES system_skills(skill_id),
artifact_type TEXT NOT NULL,
relative_path TEXT NOT NULL, -- within the skill directory
file_name TEXT NOT NULL,
mime_type TEXT,
size_bytes INTEGER,
sha256_hash TEXT NOT NULL,
UNIQUE(skill_id, relative_path)
);
This wants its own crate because it is a separate architectural lane:
CREATE TABLE system_skill_chunks (
chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,
skill_id TEXT NOT NULL REFERENCES system_skills(skill_id),
artifact_id INTEGER NOT NULL REFERENCES system_skill_artifacts(artifact_id),
chunk_type TEXT NOT NULL,
heading TEXT,
content TEXT NOT NULL,
line_start INTEGER,
line_end INTEGER,
tokens_estimate INTEGER
);
- distinct data source
- distinct lifecycle
- distinct operator story
- distinct proof gates
CREATE INDEX idx_skills_status ON system_skills(status);
CREATE INDEX idx_artifacts_skill ON system_skill_artifacts(skill_id);
CREATE INDEX idx_artifacts_type ON system_skill_artifacts(artifact_type);
CREATE INDEX idx_chunks_skill ON system_skill_chunks(skill_id);
CREATE INDEX idx_chunks_type ON system_skill_chunks(chunk_type);
It is not just a helper module inside `colibri-daemon`.
CREATE VIRTUAL TABLE IF NOT EXISTS skill_fts USING fts5(
content,
heading,
skill_id,
chunk_type,
content=system_skill_chunks,
content_rowid=chunk_id
);
```
That keeps the split-brain boundary visible in code:
## Import flow target
- `colibri-runtime` = host/runtime facts
- `colibri-skills` = built-in handbook facts
- future memory lane = user/agent memory, if needed later
1. Read Clawdie-AI checkout path from config/env.
2. Scan for directories containing `run_manifest.json`.
3. Parse manifest and derive skill metadata.
4. Read artifacts, compute SHA-256, and verify `artifacts.sha256` when present.
5. Chunk Markdown by heading and transcripts by timestamp/segment.
6. Upsert SQLite rows idempotently.
7. Return `ImportSummary` with skills found/indexed/skipped, artifacts, chunks,
checksum failures, and errors.
## Phase 0 — source map before scaffolding
## CLI surface target
Before adding the crate, copy the TypeScript contract into this document or a
small fixture so the Rust implementation does not guess at table names or env
wiring.
```sh
colibri list-skills
colibri show-skill <id>
colibri search-skills <query>
colibri index-skills
colibri verify-skill <id>
```
Known current tables from `clawdie-ai/src/split-brain-status.ts` and
`bootstrap/skills-memory/artifact.sql`:
## Portability rules
- `builtin_knowledge_artifacts`
- `builtin_knowledge_documents`
- `builtin_knowledge_chunks`
- `builtin_knowledge_embeddings`
- Store image paths and hashes, not blobs.
- Treat local provenance paths like `/home/samob/Videos/...` as metadata only.
- Verify checksums against committed artifacts, not local source paths.
- Store paths relative to the Clawdie-AI repo.
- Normal tests must not require PostgreSQL, remote Forgejo, or local source
videos/audio.
Phase-0 checks:
## Future skillpacks
1. Identify the exact read-only connection source/env var used for the skills
DB in production.
2. Record the minimum queries needed for status:
- artifact row count
- document row count
- chunk row count
- latest `artifact_version`
3. Choose an opt-in live-test env var so normal workspace tests do **not**
require PostgreSQL.
4. Add offline fixture rows/golden JSON before any live DB test.
```text
astro-howto
forgejo-admin
vaultwarden-onboarding
freebsd-update-reboot
colibri-iso-build
zed-on-freebsd
pi-headless-login
```
## Phase 1 — read-only consumer
## Implementation phases
The first implementation should stay intentionally small.
Deliverables:
1. New crate scaffold:
- `crates/colibri-skills`
2. DB-facing types for:
- artifact metadata
- documents
- chunks
- text search results
3. Read-only API for:
- artifact/import status
- list current artifact version
- search built-in knowledge by text
4. Unit tests with fixture rows or golden snapshots
5. Optional ignored/opt-in integration test for a real skills DB
Phase 1 goal:
> prove Colibri can read the "manuals already included" lane cleanly, without
> touching memory or generator code.
## Phase 2 — daemon integration
After Phase 1 is proven, wire it into Colibri surfaces:
- `colibri-daemon`
- expose a query path for built-in handbook lookups
- `colibri-client`
- typed client method
- `colibri-glasspane-tui`
- optional "manuals included" / artifact-status pane
This should still remain read-only.
## Phase 3 — optional deeper work
Only after the read path is stable should we consider:
- vector similarity for built-in knowledge
- richer ranking/grounding
- parity checks against the TypeScript retrieval layer
- eventual Rust ownership of artifact generation
That is explicitly **not** phase 1.
## Contracts and proof
The proof for `colibri-skills` should be simple:
1. Offline tests can parse fixture rows and return typed status/search results.
2. With an explicit live DB env var, Linux and FreeBSD can connect read-only to
the skills DB / `system_skills` lane.
3. Colibri can report whether the committed artifact is imported.
4. Colibri can query known built-in handbook content by text.
5. Results are typed and stable enough for daemon/client use.
This is narrower than the daemon proof gates and should stay that way.
## Naming
Why `colibri-skills`:
- it matches the existing `system_skills` boundary
- it stays concrete
- it maps to the "manuals already included" story without pretending it owns
all knowledge
Rejected names for now:
- `colibri-brain`
- too vague; collides with user memory
- `colibri-knowledge`
- broader than the actual lane
- `colibri-manuals`
- good product language, but weaker alignment with existing technical names
## Non-goals
- no new repo
- no crate inside `clawdie-iso`
- no merge of `system_skills` with `system_brain`
- no rewrite of `bootstrap/skills-memory/artifact.sql` generation yet
- no embedding refresh port to Rust yet
| Phase | What | Depends on |
|-------|------|------------|
| 1 | Scaffold crate + structs + schema plan | Nothing |
| 2 | Manifest parser (`run_manifest.json` → `SkillManifest`) | Phase 1 |
| 3 | Checksum validator (`artifacts.sha256` → verify) | Phase 2 |
| 4 | Markdown/transcript chunker | Phase 1 |
| 5 | SQLite storage + FTS5 search | Phases 3, 4 |
| 6 | CLI commands (`list`, `show`, `search`, `index`, `verify`) | Phase 5 |
| 7 | Daemon/client/TUI integration | Phase 6 |
## Related sources
- `clawdie-ai/docs/internal/SKILLS-ARTIFACT-V1-PLAN.md`
- `clawdie-ai/html/docs-clawdie-si/docs/split-brain.html`
- `clawdie-ai/setup/skills-memory.ts`
- `clawdie-ai/docs/astro-howto/`
- `clawdie-ai/docs/VAULTWARDEN-SETUP.md`
- `clawdie-ai/bootstrap/skills-memory/artifact.sql`
- `clawdie-ai/src/split-brain-status.ts`