From 5267f97f52273dbce6b2e2f5f985c010f77d0435 Mon Sep 17 00:00:00 2001 From: Sam & Hermes Date: Sun, 31 May 2026 14:36:43 +0200 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20scaffold=20colibri-skills=20crate?= =?UTF-8?q?=20=E2=80=94=20split-brain=20read=20consumer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1: structs + type system + 12 tests. No IO, no SQLite yet. Compiles against full workspace (9 crates now, up from 8). The colibri-skills crate is the read-only runtime consumer for skill artifacts authored in Clawdie-AI. It does NOT store or author skills — it indexes committed, reviewed skill bundles. Seeded from the astro-howto artifact (PR #6 in clawdie-ai): - Skill, SkillManifest, SkillArtifact, SkillChunk structs - ArtifactType classifier (document, image, script, transcript, etc.) - ImportSummary + SearchResult types - SQLite schema documented in doc/COLIBRI-SKILLS-PLAN.md Build: pass | Tests: 12/12 green | Clippy: pending --- Cargo.lock | 12 ++ Cargo.toml | 2 +- crates/colibri-skills/Cargo.toml | 11 ++ crates/colibri-skills/src/lib.rs | 322 +++++++++++++++++++++++++++++++ doc/COLIBRI-SKILLS-PLAN.md | 254 ++++++++++++++++++++++++ 5 files changed, 600 insertions(+), 1 deletion(-) create mode 100644 crates/colibri-skills/Cargo.toml create mode 100644 crates/colibri-skills/src/lib.rs create mode 100644 doc/COLIBRI-SKILLS-PLAN.md diff --git a/Cargo.lock b/Cargo.lock index 2c32456..5442670 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,7 +180,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "serde", + "wasm-bindgen", "windows-link", ] @@ -284,6 +287,15 @@ dependencies = [ "tokio", ] +[[package]] +name = "colibri-skills" +version = "0.0.1" +dependencies = [ + "chrono", + "serde", + "serde_json", +] + [[package]] name = "colibri-store" version = "0.0.1" 
diff --git a/Cargo.toml b/Cargo.toml index cdd1b4b..8e952d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store"] +members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store", "crates/colibri-skills"] [package] name = "colibri" diff --git a/crates/colibri-skills/Cargo.toml b/crates/colibri-skills/Cargo.toml new file mode 100644 index 0000000..6997dc4 --- /dev/null +++ b/crates/colibri-skills/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "colibri-skills" +version = "0.0.1" +edition = "2021" +description = "Colibri skills read consumer — indexes Clawdie-AI skill artifacts into SQLite" +license = "AGPL-3.0-only" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +chrono = { version = "0.4", features = ["serde"] } diff --git a/crates/colibri-skills/src/lib.rs b/crates/colibri-skills/src/lib.rs new file mode 100644 index 0000000..4d54b11 --- /dev/null +++ b/crates/colibri-skills/src/lib.rs @@ -0,0 +1,322 @@ +//! Colibri Skills — read-only consumer for Clawdie-AI skill artifacts. +//! +//! This crate indexes committed, reviewed skill artifacts from the Clawdie-AI +//! repository into SQLite. It does NOT author, edit, or store skill content — +//! that responsibility lives in Clawdie-AI. +//! +//! ```text +//! Clawdie-AI repo (source of truth) +//! docs/astro-howto/ +//! docs/forgejo-admin/ +//! ... +//! +//! colibri-skills (read-only consumer) +//! reads run_manifest.json +//! validates checksums +//! indexes markdown/transcript chunks +//! serves CLI/TUI search +//! 
``` + +use serde::{Deserialize, Serialize}; + +// ── Core types ──────────────────────────────────────────────────────────── + +/// A read-only skill artifact indexed from Clawdie-AI. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Skill { + pub skill_id: String, + pub display_name: String, + /// Relative path within the Clawdie-AI repo (e.g. "docs/astro-howto"). + pub source_path: String, + pub manifest: SkillManifest, + pub artifacts: Vec, + pub status: SkillStatus, + /// Natural-language verification test. + pub verification: Option, +} + +/// Parsed from `run_manifest.json` in a skill directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillManifest { + pub run_id: String, + pub created: String, + pub source: Option, + pub pipeline_stages: Vec, + pub models_used: Vec, + pub notes: Option, +} + +/// Source media metadata from the manifest. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ManifestSource { + pub path: Option, + pub size_human: Option, + pub codec: Option, + pub resolution: Option, + pub duration_human: Option, + pub original_untouched: Option, +} + +/// A single stage in the skill generation pipeline. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PipelineStage { + pub stage: String, + pub tool: String, + pub model: Option, + pub output: Option, + pub duration_s: Option, + pub lines: Option, + pub size_human: Option, +} + +/// A model used during skill generation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ModelUsage { + pub model: String, + pub purpose: String, + pub api_key_used: bool, +} + +/// An individual file within a skill directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillArtifact { + pub artifact_type: ArtifactType, + /// Path relative to the skill directory. + pub relative_path: String, + pub file_name: String, + pub mime_type: Option, + pub size_bytes: u64, + pub sha256_hash: String, +} + +/// Classification of a skill artifact file. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ArtifactType { + Document, + Image, + Script, + Transcript, + Manifest, + Checksum, + Report, + ContactSheet, + Other(String), +} + +impl ArtifactType { + pub fn from_path(path: &str) -> Self { + let lower = path.to_lowercase(); + // Check specific patterns before generic extensions. + if lower.ends_with(".py") || lower.ends_with(".sh") || lower.ends_with(".bash") { + return ArtifactType::Script; + } + if lower.contains("contact_sheet") { + return ArtifactType::ContactSheet; + } + if lower.contains("run_manifest") && lower.ends_with(".json") { + return ArtifactType::Manifest; + } + if lower.contains("sha256") || lower.contains("checksum") { + return ArtifactType::Checksum; + } + if lower.contains("report") && lower.ends_with(".json") { + return ArtifactType::Report; + } + if lower.ends_with(".md") { + ArtifactType::Document + } else if lower.ends_with(".jpg") || lower.ends_with(".png") || lower.ends_with(".webp") { + ArtifactType::Image + } else if lower.ends_with(".txt") && lower.contains("transcript") { + ArtifactType::Transcript + } else { + ArtifactType::Other(path.to_string()) + } + } +} + +/// A chunk of searchable content extracted from a skill artifact. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillChunk { + pub chunk_id: Option, + pub skill_id: String, + pub artifact_relative_path: String, + pub chunk_type: ChunkType, + /// Nearest markdown heading, if applicable. 
+ pub heading: Option, + pub content: String, + pub line_start: u64, + pub line_end: u64, + pub tokens_estimate: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ChunkType { + MarkdownSection, + TranscriptSegment, + CommandBlock, + CodeBlock, + Table, +} + +// ── Status ──────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SkillStatus { + Active, + Archived, + Superseded, +} + +impl Default for SkillStatus { + fn default() -> Self { + SkillStatus::Active + } +} + +// ── Import summary ──────────────────────────────────────────────────────── + +/// Returned after indexing a Clawdie-AI checkout. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImportSummary { + pub skills_found: usize, + pub skills_indexed: usize, + pub skills_skipped: usize, + pub artifacts_total: usize, + pub chunks_total: usize, + pub checksum_failures: usize, + pub errors: Vec, +} + +impl ImportSummary { + pub fn success(&self) -> bool { + self.errors.is_empty() && self.checksum_failures == 0 + } +} + +// ── Search result ───────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SearchResult { + pub skill_id: String, + pub display_name: String, + pub chunk_type: ChunkType, + pub heading: Option, + pub snippet: String, + pub artifact_path: String, + pub line_start: u64, +} + +// ── Tests ───────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn artifact_type_from_path_document() { + assert_eq!( + ArtifactType::from_path("docs/HOWTO.md"), + ArtifactType::Document + ); + } + + #[test] + fn artifact_type_from_path_image() { + assert_eq!( + ArtifactType::from_path("screenshots/001_00-01-05_intro.jpg"), + ArtifactType::Image + ); + } + + #[test] + fn 
artifact_type_from_path_script_python() { + assert_eq!( + ArtifactType::from_path("scripts/generate_contact_sheet.py"), + ArtifactType::Script + ); + } + + #[test] + fn artifact_type_from_path_script_shell() { + assert_eq!( + ArtifactType::from_path("scripts/extract_screenshots.sh"), + ArtifactType::Script + ); + } + + #[test] + fn artifact_type_from_path_transcript() { + assert_eq!( + ArtifactType::from_path("transcript_local.txt"), + ArtifactType::Transcript + ); + } + + #[test] + fn artifact_type_from_path_manifest() { + assert_eq!( + ArtifactType::from_path("run_manifest.json"), + ArtifactType::Manifest + ); + } + + #[test] + fn artifact_type_from_path_checksum() { + assert_eq!( + ArtifactType::from_path("artifacts.sha256"), + ArtifactType::Checksum + ); + } + + #[test] + fn artifact_type_from_path_report() { + assert_eq!( + ArtifactType::from_path("contact-sheet/report.json"), + ArtifactType::Report + ); + } + + #[test] + fn artifact_type_from_path_contact_sheet() { + assert_eq!( + ArtifactType::from_path("contact-sheet/contact_sheet.jpg"), + ArtifactType::ContactSheet + ); + } + + #[test] + fn skill_status_default_is_active() { + assert_eq!(SkillStatus::default(), SkillStatus::Active); + } + + #[test] + fn import_summary_success_empty() { + let s = ImportSummary { + skills_found: 0, + skills_indexed: 0, + skills_skipped: 0, + artifacts_total: 0, + chunks_total: 0, + checksum_failures: 0, + errors: vec![], + }; + assert!(s.success()); + } + + #[test] + fn import_summary_failure_on_error() { + let s = ImportSummary { + skills_found: 1, + skills_indexed: 0, + skills_skipped: 1, + artifacts_total: 0, + chunks_total: 0, + checksum_failures: 0, + errors: vec!["manifest parse error".into()], + }; + assert!(!s.success()); + } +} diff --git a/doc/COLIBRI-SKILLS-PLAN.md b/doc/COLIBRI-SKILLS-PLAN.md new file mode 100644 index 0000000..db4c0a0 --- /dev/null +++ b/doc/COLIBRI-SKILLS-PLAN.md @@ -0,0 +1,254 @@ +# Colibri Skills — Split-Brain Read Consumer + +## Purpose + 
+`colibri-skills` is the read-only runtime consumer for skill artifacts authored +in the Clawdie-AI repo. It does NOT store or author skills — it indexes them. + +```text + Clawdie-AI repo (source of truth) + docs/astro-howto/ + docs/forgejo-admin/ + docs/vaultwarden-onboarding/ + ... + + Colibri colibri-skills crate (read-only consumer) + reads committed skill artifacts + validates checksums + indexes Markdown/transcript chunks + exposes Skill, SkillArtifact, SkillChunk structs + serves CLI/TUI search +``` + +## Provenance example + +The first seed artifact: `docs/astro-howto/` in Clawdie-AI. + +```json +{ + "skill_id": "astro-howto", + "source": "local video-derived training artifact", + "inputs": [ + "transcript_local.txt", + "screenshots/", + "contact-sheet/contact_sheet.jpg" + ], + "outputs": [ + "docs/HOWTO.md", + "docs/COMMANDS.md", + "docs/SCREENSHOTS.md", + "docs/SUMMARY.md" + ], + "verification": "can user create and run an Astro project?", + "media": "screenshots/*.jpg (paths + hashes, not blobs)", + "manifest": "run_manifest.json", + "checksums": "artifacts.sha256" +} +``` + +Pipeline that produced it: + +```text + video → local transcript → topic extraction → how-to/runbook + → screenshots/contact sheet → commands → verification test + → manifest + checksums → reviewed skill artifact → Colibri read-only index +``` + +## Architecture + +### Ownership + +| Layer | Role | Writes | Reads | +|-------|------|--------|-------| +| Clawdie-AI | Source of truth | Skill artifacts via PR | N/A | +| Colibri colibri-skills | Runtime consumer | Never | Indexed skill structs from committed artifacts | +| Agents (Hermes, Claude, Codex) | Authors | Create PRs with skill artifacts | Skill content for task routing | +| system_brain | Agent memory | Personal/user/agent context | N/A — not canonical docs | +| system_ops | Runtime state | Live task/service state | N/A — not skills | + +### What Colibri-skills does NOT do + +- Store skill content (that's the Clawdie-AI repo's 
job) +- Store image blobs in SQLite (paths + hashes only) +- Write, edit, or create skills (read-only) +- Replace system_brain (personal/agent memory is separate) +- Replace system_ops (runtime state is separate) + +### What Colibri-skills DOES + +- Read skill manifests from a configured Clawdie-AI checkout path +- Parse run_manifest.json for each skill +- Validate checksums against artifacts.sha256 +- Index Markdown sections for search +- Expose Skill, SkillArtifact, SkillChunk structs +- Back by SQLite (system_skills, system_skill_artifacts, system_skill_chunks) + +## SQLite schema + +```sql +CREATE TABLE system_skills ( + skill_id TEXT PRIMARY KEY, + display_name TEXT NOT NULL, + source_path TEXT NOT NULL, -- relative within Clawdie-AI repo + manifest_hash TEXT, -- sha256 of run_manifest.json + created_at TEXT NOT NULL, -- ISO 8601 + updated_at TEXT NOT NULL, + verification TEXT, -- natural-language verification test + status TEXT NOT NULL DEFAULT 'active' -- active, archived, superseded +); + +CREATE TABLE system_skill_artifacts ( + artifact_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_type TEXT NOT NULL, -- document, image, script, transcript, manifest, checksum + relative_path TEXT NOT NULL, -- within the skill directory + file_name TEXT NOT NULL, + mime_type TEXT, + size_bytes INTEGER, + sha256_hash TEXT NOT NULL, + UNIQUE(skill_id, relative_path) +); + +CREATE TABLE system_skill_chunks ( + chunk_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_id INTEGER NOT NULL REFERENCES system_skill_artifacts(artifact_id), + chunk_type TEXT NOT NULL, -- markdown_section, transcript_segment, command_block + heading TEXT, -- nearest markdown heading, if applicable + content TEXT NOT NULL, -- the chunk text (not full file) + line_start INTEGER, -- line number in source file + line_end INTEGER, + tokens_estimate INTEGER -- rough token count for 
search ranking +); + +CREATE INDEX idx_skills_status ON system_skills(status); +CREATE INDEX idx_artifacts_skill ON system_skill_artifacts(skill_id); +CREATE INDEX idx_artifacts_type ON system_skill_artifacts(artifact_type); +CREATE INDEX idx_chunks_skill ON system_skill_chunks(skill_id); +CREATE INDEX idx_chunks_type ON system_skill_chunks(chunk_type); +CREATE VIRTUAL TABLE IF NOT EXISTS skill_fts USING fts5( + content, + heading, + skill_id, + chunk_type, + content=system_skill_chunks, + content_rowid=chunk_id +); +``` + +## Rust structs (planned) + +```rust +// crates/colibri-skills/src/lib.rs + +pub struct Skill { + pub skill_id: String, + pub display_name: String, + pub source_path: String, + pub manifest: SkillManifest, + pub artifacts: Vec, + pub status: SkillStatus, + pub verification: Option, +} + +pub struct SkillManifest { + pub run_id: String, + pub pipeline_stages: Vec, + pub models_used: Vec, + pub created: String, +} + +pub struct PipelineStage { + pub stage: String, + pub tool: String, + pub output: String, + pub duration_s: Option, +} + +pub struct ModelUsage { + pub model: String, + pub purpose: String, + pub api_key_used: bool, +} + +pub struct SkillArtifact { + pub artifact_type: ArtifactType, + pub relative_path: String, + pub file_name: String, + pub mime_type: Option, + pub size_bytes: u64, + pub sha256_hash: String, +} + +pub enum ArtifactType { + Document, // .md + Image, // .jpg, .png, .webp + Script, // .py, .sh + Transcript, // .txt + Manifest, // run_manifest.json + Checksum, // artifacts.sha256 + Report, // report.json +} + +pub enum SkillStatus { + Active, + Archived, + Superseded, +} +``` + +## CLI surface (future) + +```sh +colibri list-skills # list all indexed skills +colibri show-skill # full skill metadata + artifact list +colibri search-skills # FTS5 search across chunks +colibri index-skills # re-index from Clawdie-AI checkout +colibri verify-skill # re-validate checksums +``` + +## Import flow + +``` +1. 
colibri-skills reads Clawdie-AI path from config/env +2. Scans for directories containing run_manifest.json +3. Parses manifest → extracts skill metadata +4. For each artifact in manifest: reads file, computes/verifies sha256 +5. Chunks Markdown files by heading, transcript files by segment +6. Inserts/updates SQLite rows (idempotent — replace on conflict) +7. Returns import summary: skills found, artifacts indexed, chunks created +``` + +## Portability rules + +- Image paths and hashes stored, not blobs +- Local provenance paths (e.g., `/home/samob/Videos/...`) are nonportable + metadata — stored but never assumed to exist at runtime +- Checksums are portable — verified against artifact content, not source paths +- Paths stored as relative within Clawdie-AI repo + +## Future skillpacks (candidates) + +```text + astro-howto + forgejo-admin + vaultwarden-onboarding + freebsd-update-reboot + colibri-iso-build + zed-on-freebsd + pi-headless-login +``` + +## Implementation phases + +| Phase | What | Depends on | +|-------|------|-----------| +| 1 | Scaffold crate + structs + SQLite schema | Nothing | +| 2 | Manifest parser (run_manifest.json → SkillManifest) | Phase 1 | +| 3 | Checksum validator (artifacts.sha256 → verify) | Phase 2 | +| 4 | Markdown chunker (heading-based) | Phase 1 | +| 5 | FTS5-backed search | Phases 3, 4 | +| 6 | CLI commands (list, show, search, index) | Phases 3, 4 | +| 7 | Daemon integration (auto-index on config path changes) | Phase 6 | + +Phase 1 is scaffold-only — compile check, no runtime behavior. -- 2.45.3 From 1da49eac4f7689c80a34fd3232349cc2f8728e33 Mon Sep 17 00:00:00 2001 From: Sam & Claude Date: Sun, 31 May 2026 14:38:28 +0200 Subject: [PATCH 2/3] fix: satisfy clippy for skill status default (Sam & Codex) Validation: cargo fmt --check; cargo clippy -p colibri-skills --all-targets -- -D warnings; cargo test -p colibri-skills. 
--- crates/colibri-skills/src/lib.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/crates/colibri-skills/src/lib.rs b/crates/colibri-skills/src/lib.rs index 4d54b11..06934ff 100644 --- a/crates/colibri-skills/src/lib.rs +++ b/crates/colibri-skills/src/lib.rs @@ -162,20 +162,15 @@ pub enum ChunkType { // ── Status ──────────────────────────────────────────────────────────────── -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum SkillStatus { + #[default] Active, Archived, Superseded, } -impl Default for SkillStatus { - fn default() -> Self { - SkillStatus::Active - } -} - // ── Import summary ──────────────────────────────────────────────────────── /// Returned after indexing a Clawdie-AI checkout. -- 2.45.3 From 8b419f79ed51e68a5324fb2ea1575942b8b0cda7 Mon Sep 17 00:00:00 2001 From: Sam & Claude Date: Sun, 31 May 2026 14:39:49 +0200 Subject: [PATCH 3/3] docs: align skills plan with scaffold (Sam & Codex) Validation: cargo fmt --check; cargo clippy -p colibri-skills --all-targets -- -D warnings; cargo test -p colibri-skills; git diff --check. --- doc/COLIBRI-SKILLS-PLAN.md | 254 -------------------------- docs/COLIBRI-SKILLS-PLAN.md | 354 +++++++++++++++++++----------------- 2 files changed, 187 insertions(+), 421 deletions(-) delete mode 100644 doc/COLIBRI-SKILLS-PLAN.md diff --git a/doc/COLIBRI-SKILLS-PLAN.md b/doc/COLIBRI-SKILLS-PLAN.md deleted file mode 100644 index db4c0a0..0000000 --- a/doc/COLIBRI-SKILLS-PLAN.md +++ /dev/null @@ -1,254 +0,0 @@ -# Colibri Skills — Split-Brain Read Consumer - -## Purpose - -`colibri-skills` is the read-only runtime consumer for skill artifacts authored -in the Clawdie-AI repo. It does NOT store or author skills — it indexes them. - -```text - Clawdie-AI repo (source of truth) - docs/astro-howto/ - docs/forgejo-admin/ - docs/vaultwarden-onboarding/ - ... 
- - Colibri colibri-skills crate (read-only consumer) - reads committed skill artifacts - validates checksums - indexes Markdown/transcript chunks - exposes Skill, SkillArtifact, SkillChunk structs - serves CLI/TUI search -``` - -## Provenance example - -The first seed artifact: `docs/astro-howto/` in Clawdie-AI. - -```json -{ - "skill_id": "astro-howto", - "source": "local video-derived training artifact", - "inputs": [ - "transcript_local.txt", - "screenshots/", - "contact-sheet/contact_sheet.jpg" - ], - "outputs": [ - "docs/HOWTO.md", - "docs/COMMANDS.md", - "docs/SCREENSHOTS.md", - "docs/SUMMARY.md" - ], - "verification": "can user create and run an Astro project?", - "media": "screenshots/*.jpg (paths + hashes, not blobs)", - "manifest": "run_manifest.json", - "checksums": "artifacts.sha256" -} -``` - -Pipeline that produced it: - -```text - video → local transcript → topic extraction → how-to/runbook - → screenshots/contact sheet → commands → verification test - → manifest + checksums → reviewed skill artifact → Colibri read-only index -``` - -## Architecture - -### Ownership - -| Layer | Role | Writes | Reads | -|-------|------|--------|-------| -| Clawdie-AI | Source of truth | Skill artifacts via PR | N/A | -| Colibri colibri-skills | Runtime consumer | Never | Indexed skill structs from committed artifacts | -| Agents (Hermes, Claude, Codex) | Authors | Create PRs with skill artifacts | Skill content for task routing | -| system_brain | Agent memory | Personal/user/agent context | N/A — not canonical docs | -| system_ops | Runtime state | Live task/service state | N/A — not skills | - -### What Colibri-skills does NOT do - -- Store skill content (that's the Clawdie-AI repo's job) -- Store image blobs in SQLite (paths + hashes only) -- Write, edit, or create skills (read-only) -- Replace system_brain (personal/agent memory is separate) -- Replace system_ops (runtime state is separate) - -### What Colibri-skills DOES - -- Read skill manifests from a 
configured Clawdie-AI checkout path -- Parse run_manifest.json for each skill -- Validate checksums against artifacts.sha256 -- Index Markdown sections for search -- Expose Skill, SkillArtifact, SkillChunk structs -- Back by SQLite (system_skills, system_skill_artifacts, system_skill_chunks) - -## SQLite schema - -```sql -CREATE TABLE system_skills ( - skill_id TEXT PRIMARY KEY, - display_name TEXT NOT NULL, - source_path TEXT NOT NULL, -- relative within Clawdie-AI repo - manifest_hash TEXT, -- sha256 of run_manifest.json - created_at TEXT NOT NULL, -- ISO 8601 - updated_at TEXT NOT NULL, - verification TEXT, -- natural-language verification test - status TEXT NOT NULL DEFAULT 'active' -- active, archived, superseded -); - -CREATE TABLE system_skill_artifacts ( - artifact_id INTEGER PRIMARY KEY AUTOINCREMENT, - skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), - artifact_type TEXT NOT NULL, -- document, image, script, transcript, manifest, checksum - relative_path TEXT NOT NULL, -- within the skill directory - file_name TEXT NOT NULL, - mime_type TEXT, - size_bytes INTEGER, - sha256_hash TEXT NOT NULL, - UNIQUE(skill_id, relative_path) -); - -CREATE TABLE system_skill_chunks ( - chunk_id INTEGER PRIMARY KEY AUTOINCREMENT, - skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), - artifact_id INTEGER NOT NULL REFERENCES system_skill_artifacts(artifact_id), - chunk_type TEXT NOT NULL, -- markdown_section, transcript_segment, command_block - heading TEXT, -- nearest markdown heading, if applicable - content TEXT NOT NULL, -- the chunk text (not full file) - line_start INTEGER, -- line number in source file - line_end INTEGER, - tokens_estimate INTEGER -- rough token count for search ranking -); - -CREATE INDEX idx_skills_status ON system_skills(status); -CREATE INDEX idx_artifacts_skill ON system_skill_artifacts(skill_id); -CREATE INDEX idx_artifacts_type ON system_skill_artifacts(artifact_type); -CREATE INDEX idx_chunks_skill ON 
system_skill_chunks(skill_id); -CREATE INDEX idx_chunks_type ON system_skill_chunks(chunk_type); -CREATE VIRTUAL TABLE IF NOT EXISTS skill_fts USING fts5( - content, - heading, - skill_id, - chunk_type, - content=system_skill_chunks, - content_rowid=chunk_id -); -``` - -## Rust structs (planned) - -```rust -// crates/colibri-skills/src/lib.rs - -pub struct Skill { - pub skill_id: String, - pub display_name: String, - pub source_path: String, - pub manifest: SkillManifest, - pub artifacts: Vec, - pub status: SkillStatus, - pub verification: Option, -} - -pub struct SkillManifest { - pub run_id: String, - pub pipeline_stages: Vec, - pub models_used: Vec, - pub created: String, -} - -pub struct PipelineStage { - pub stage: String, - pub tool: String, - pub output: String, - pub duration_s: Option, -} - -pub struct ModelUsage { - pub model: String, - pub purpose: String, - pub api_key_used: bool, -} - -pub struct SkillArtifact { - pub artifact_type: ArtifactType, - pub relative_path: String, - pub file_name: String, - pub mime_type: Option, - pub size_bytes: u64, - pub sha256_hash: String, -} - -pub enum ArtifactType { - Document, // .md - Image, // .jpg, .png, .webp - Script, // .py, .sh - Transcript, // .txt - Manifest, // run_manifest.json - Checksum, // artifacts.sha256 - Report, // report.json -} - -pub enum SkillStatus { - Active, - Archived, - Superseded, -} -``` - -## CLI surface (future) - -```sh -colibri list-skills # list all indexed skills -colibri show-skill # full skill metadata + artifact list -colibri search-skills # FTS5 search across chunks -colibri index-skills # re-index from Clawdie-AI checkout -colibri verify-skill # re-validate checksums -``` - -## Import flow - -``` -1. colibri-skills reads Clawdie-AI path from config/env -2. Scans for directories containing run_manifest.json -3. Parses manifest → extracts skill metadata -4. For each artifact in manifest: reads file, computes/verifies sha256 -5. 
Chunks Markdown files by heading, transcript files by segment -6. Inserts/updates SQLite rows (idempotent — replace on conflict) -7. Returns import summary: skills found, artifacts indexed, chunks created -``` - -## Portability rules - -- Image paths and hashes stored, not blobs -- Local provenance paths (e.g., `/home/samob/Videos/...`) are nonportable - metadata — stored but never assumed to exist at runtime -- Checksums are portable — verified against artifact content, not source paths -- Paths stored as relative within Clawdie-AI repo - -## Future skillpacks (candidates) - -```text - astro-howto - forgejo-admin - vaultwarden-onboarding - freebsd-update-reboot - colibri-iso-build - zed-on-freebsd - pi-headless-login -``` - -## Implementation phases - -| Phase | What | Depends on | -|-------|------|-----------| -| 1 | Scaffold crate + structs + SQLite schema | Nothing | -| 2 | Manifest parser (run_manifest.json → SkillManifest) | Phase 1 | -| 3 | Checksum validator (artifacts.sha256 → verify) | Phase 2 | -| 4 | Markdown chunker (heading-based) | Phase 1 | -| 5 | FTS5-backed search | Phases 3, 4 | -| 6 | CLI commands (list, show, search, index) | Phases 3, 4 | -| 7 | Daemon integration (auto-index on config path changes) | Phase 6 | - -Phase 1 is scaffold-only — compile check, no runtime behavior. diff --git a/docs/COLIBRI-SKILLS-PLAN.md b/docs/COLIBRI-SKILLS-PLAN.md index e39e4fb..c46a650 100644 --- a/docs/COLIBRI-SKILLS-PLAN.md +++ b/docs/COLIBRI-SKILLS-PLAN.md @@ -1,211 +1,231 @@ # Colibri Skills Plan -**Date:** 27.maj.2026 -**Status:** PLANNED — docs first, no crate scaffold yet +**Status:** Phase 1 scaffolded — read-only split-brain consumer + +**Crate:** `crates/colibri-skills` ## Purpose -Define the next Colibri slice for Clawdie's split-brain model: +`colibri-skills` is Colibri's read-only runtime consumer for reviewed skill +artifacts authored in the Clawdie-AI repo. It does not author, edit, or store +canonical skills. 
Clawdie-AI remains the source of truth; Colibri indexes and +serves typed/runtime views. -- **built-in knowledge** stays separate from -- **user/agent memory** and -- **operational state** +```text +Clawdie-AI repo (source of truth) + docs/astro-howto/ + docs/forgejo-admin/ + docs/vaultwarden-onboarding/ + ... -The product explanation is simple: +Colibri colibri-skills crate (read-only consumer) + reads committed skill artifacts + validates checksums + indexes Markdown/transcript chunks + exposes Skill, SkillArtifact, SkillChunk structs + serves CLI/TUI/search later +``` -> Think of it like the machine already ships with its handbook inside. +This keeps the split-brain model explicit: -In Clawdie terms, that handbook is the **committed built-in knowledge -artifact** imported into `system_skills`. +- `system_skills`: committed built-in knowledge / manuals / reviewed skillpacks +- `system_brain`: user and agent memory +- `system_ops`: live runtime, task, service, and daemon state -This plan names the Rust-side consumer for that lane: +## Seed artifact: Astro how-to -- **crate name:** `colibri-skills` +The first concrete skillpack is `docs/astro-howto/` in Clawdie-AI. It is useful +because it is not just prose; it includes transcript, generated how-to docs, +commands, screenshots, contact sheet, manifest, checksums, and scripts. -It is a **read path first**, not a new generator, not a new database, and not a -memory replacement. 
+```json +{ + "skill_id": "astro-howto", + "source": "local video-derived training artifact", + "inputs": [ + "transcript_local.txt", + "screenshots/", + "contact-sheet/contact_sheet.jpg" + ], + "outputs": [ + "docs/HOWTO.md", + "docs/COMMANDS.md", + "docs/SCREENSHOTS.md", + "docs/SUMMARY.md" + ], + "verification": "can user create and run an Astro project?", + "media": "screenshots/*.jpg (paths + hashes, not blobs)", + "manifest": "run_manifest.json", + "checksums": "artifacts.sha256" +} +``` -## Background +Pipeline shape: -The current split-brain design already exists in `clawdie-ai`: +```text +video → local transcript → topic extraction → how-to/runbook +→ screenshots/contact sheet → commands → verification test +→ manifest + checksums → reviewed skill artifact → Colibri read-only index +``` -- `system_skills` - - built-in knowledge / manuals already included - - sourced from `bootstrap/skills-memory/artifact.sql` -- `system_brain` - - dynamic user/agent memory -- `system_ops` - - runtime and operational state +## Ownership -Colibri should reflect that boundary explicitly instead of flattening all -knowledge into one vague "brain." +| Layer | Role | Writes | Reads | +|-------|------|--------|-------| +| Clawdie-AI | Source of truth | Skill artifacts via PR | N/A | +| `colibri-skills` | Runtime consumer | Never to source repo | Indexed skill structs from committed artifacts | +| Agents | Authors/reviewers | Candidate skill artifact PRs | Skill content for task routing | +| `system_brain` | Agent/user memory | Personal/user/agent context | Not canonical skill docs | +| `system_ops` | Runtime state | Live task/service state | Not skills | -## What `colibri-skills` should own +## What `colibri-skills` does -`colibri-skills` should own the Rust-side access layer for **built-in -knowledge only**. 
+- Read skill manifests from a configured Clawdie-AI checkout path +- Parse `run_manifest.json` +- Validate checksums against `artifacts.sha256` +- Classify artifacts as document, image, script, transcript, manifest, checksum, + report, contact sheet, or other +- Index Markdown/transcript chunks for search +- Expose stable typed structs for daemon/client/TUI callers +- Persist runtime index metadata in SQLite -Phase-1 responsibilities: +## What `colibri-skills` does not do -- connect to the `system_skills` database -- report artifact/import status -- expose stable typed results for built-in knowledge lookup -- support text search over committed handbook content -- return structured summaries that other Colibri crates can consume +- Author, edit, or create skills +- Store image blobs in SQLite; store paths and hashes only +- Replace `system_brain` +- Replace `system_ops` +- Own provider/API budget logic +- Require nonportable local source media paths at runtime -Good callers later: +## Phase 1 delivered -- `colibri-daemon` -- `colibri-client` -- `colibri-glasspane-tui` +The scaffold crate now provides: -## What `colibri-skills` must not own +- `Skill` +- `SkillManifest` +- `SkillArtifact` +- `SkillChunk` +- `ArtifactType` +- `SkillStatus` +- `ImportSummary` +- `SearchResult` +- unit tests for artifact classification and status/summary behavior -It should **not** own: +Phase 1 is intentionally scaffold-only: compile and type proof, no runtime +import behavior yet. -- user memory retrieval from `system_brain` -- task/session/runtime state from `system_ops` -- artifact generation or embedding refresh -- OpenRouter or embedding-provider budget logic -- ISO build-time content packing +## SQLite schema target -Those already belong elsewhere. 
+```sql +CREATE TABLE system_skills ( + skill_id TEXT PRIMARY KEY, + display_name TEXT NOT NULL, + source_path TEXT NOT NULL, -- relative within Clawdie-AI repo + manifest_hash TEXT, -- sha256 of run_manifest.json + created_at TEXT NOT NULL, -- ISO 8601 + updated_at TEXT NOT NULL, + verification TEXT, -- natural-language verification test + status TEXT NOT NULL DEFAULT 'active' -- active, archived, superseded +); -## Why a separate crate +CREATE TABLE system_skill_artifacts ( + artifact_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_type TEXT NOT NULL, + relative_path TEXT NOT NULL, -- within the skill directory + file_name TEXT NOT NULL, + mime_type TEXT, + size_bytes INTEGER, + sha256_hash TEXT NOT NULL, + UNIQUE(skill_id, relative_path) +); -This wants its own crate because it is a separate architectural lane: +CREATE TABLE system_skill_chunks ( + chunk_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_id INTEGER NOT NULL REFERENCES system_skill_artifacts(artifact_id), + chunk_type TEXT NOT NULL, + heading TEXT, + content TEXT NOT NULL, + line_start INTEGER, + line_end INTEGER, + tokens_estimate INTEGER +); -- distinct data source -- distinct lifecycle -- distinct operator story -- distinct proof gates +CREATE INDEX idx_skills_status ON system_skills(status); +CREATE INDEX idx_artifacts_skill ON system_skill_artifacts(skill_id); +CREATE INDEX idx_artifacts_type ON system_skill_artifacts(artifact_type); +CREATE INDEX idx_chunks_skill ON system_skill_chunks(skill_id); +CREATE INDEX idx_chunks_type ON system_skill_chunks(chunk_type); -It is not just a helper module inside `colibri-daemon`. 
+CREATE VIRTUAL TABLE IF NOT EXISTS skill_fts USING fts5( + content, + heading, + skill_id, + chunk_type, + content=system_skill_chunks, + content_rowid=chunk_id +); +``` -That keeps the split-brain boundary visible in code: +## Import flow target -- `colibri-runtime` = host/runtime facts -- `colibri-skills` = built-in handbook facts -- future memory lane = user/agent memory, if needed later +1. Read Clawdie-AI checkout path from config/env. +2. Scan for directories containing `run_manifest.json`. +3. Parse manifest and derive skill metadata. +4. Read artifacts, compute SHA-256, and verify `artifacts.sha256` when present. +5. Chunk Markdown by heading and transcripts by timestamp/segment. +6. Upsert SQLite rows idempotently. +7. Return `ImportSummary` with skills found/indexed/skipped, artifacts, chunks, + checksum failures, and errors. -## Phase 0 — source map before scaffolding +## CLI surface target -Before adding the crate, copy the TypeScript contract into this document or a -small fixture so the Rust implementation does not guess at table names or env -wiring. +```sh +colibri list-skills +colibri show-skill <skill_id> +colibri search-skills <query> +colibri index-skills +colibri verify-skill <skill_id> +``` -Known current tables from `clawdie-ai/src/split-brain-status.ts` and -`bootstrap/skills-memory/artifact.sql`: +## Portability rules -- `builtin_knowledge_artifacts` -- `builtin_knowledge_documents` -- `builtin_knowledge_chunks` -- `builtin_knowledge_embeddings` +- Store image paths and hashes, not blobs. +- Treat local provenance paths like `/home/samob/Videos/...` as metadata only. +- Verify checksums against committed artifacts, not local source paths. +- Store paths relative to the Clawdie-AI repo. +- Normal tests must not require PostgreSQL, remote Forgejo, or local source + videos/audio. -Phase-0 checks: +## Future skillpacks -1. Identify the exact read-only connection source/env var used for the skills - DB in production. -2. 
Record the minimum queries needed for status: - - artifact row count - - document row count - - chunk row count - - latest `artifact_version` -3. Choose an opt-in live-test env var so normal workspace tests do **not** - require PostgreSQL. -4. Add offline fixture rows/golden JSON before any live DB test. +```text +astro-howto +forgejo-admin +vaultwarden-onboarding +freebsd-update-reboot +colibri-iso-build +zed-on-freebsd +pi-headless-login +``` -## Phase 1 — read-only consumer +## Implementation phases -The first implementation should stay intentionally small. - -Deliverables: - -1. New crate scaffold: - - `crates/colibri-skills` -2. DB-facing types for: - - artifact metadata - - documents - - chunks - - text search results -3. Read-only API for: - - artifact/import status - - list current artifact version - - search built-in knowledge by text -4. Unit tests with fixture rows or golden snapshots -5. Optional ignored/opt-in integration test for a real skills DB - -Phase 1 goal: - -> prove Colibri can read the "manuals already included" lane cleanly, without -> touching memory or generator code. - -## Phase 2 — daemon integration - -After Phase 1 is proven, wire it into Colibri surfaces: - -- `colibri-daemon` - - expose a query path for built-in handbook lookups -- `colibri-client` - - typed client method -- `colibri-glasspane-tui` - - optional "manuals included" / artifact-status pane - -This should still remain read-only. - -## Phase 3 — optional deeper work - -Only after the read path is stable should we consider: - -- vector similarity for built-in knowledge -- richer ranking/grounding -- parity checks against the TypeScript retrieval layer -- eventual Rust ownership of artifact generation - -That is explicitly **not** phase 1. - -## Contracts and proof - -The proof for `colibri-skills` should be simple: - -1. Offline tests can parse fixture rows and return typed status/search results. -2. 
With an explicit live DB env var, Linux and FreeBSD can connect read-only to - the skills DB / `system_skills` lane. -3. Colibri can report whether the committed artifact is imported. -4. Colibri can query known built-in handbook content by text. -5. Results are typed and stable enough for daemon/client use. - -This is narrower than the daemon proof gates and should stay that way. - -## Naming - -Why `colibri-skills`: - -- it matches the existing `system_skills` boundary -- it stays concrete -- it maps to the "manuals already included" story without pretending it owns - all knowledge - -Rejected names for now: - -- `colibri-brain` - - too vague; collides with user memory -- `colibri-knowledge` - - broader than the actual lane -- `colibri-manuals` - - good product language, but weaker alignment with existing technical names - -## Non-goals - -- no new repo -- no crate inside `clawdie-iso` -- no merge of `system_skills` with `system_brain` -- no rewrite of `bootstrap/skills-memory/artifact.sql` generation yet -- no embedding refresh port to Rust yet +| Phase | What | Depends on | +|-------|------|------------| +| 1 | Scaffold crate + structs + schema plan | Nothing | +| 2 | Manifest parser (`run_manifest.json` → `SkillManifest`) | Phase 1 | +| 3 | Checksum validator (`artifacts.sha256` → verify) | Phase 2 | +| 4 | Markdown/transcript chunker | Phase 1 | +| 5 | SQLite storage + FTS5 search | Phases 3, 4 | +| 6 | CLI commands (`list`, `show`, `search`, `index`, `verify`) | Phase 5 | +| 7 | Daemon/client/TUI integration | Phase 6 | ## Related sources -- `clawdie-ai/docs/internal/SKILLS-ARTIFACT-V1-PLAN.md` -- `clawdie-ai/html/docs-clawdie-si/docs/split-brain.html` -- `clawdie-ai/setup/skills-memory.ts` +- `clawdie-ai/docs/astro-howto/` +- `clawdie-ai/docs/VAULTWARDEN-SETUP.md` +- `clawdie-ai/bootstrap/skills-memory/artifact.sql` - `clawdie-ai/src/split-brain-status.ts` -- 2.45.3