diff --git a/Cargo.lock b/Cargo.lock index 2c32456..5442670 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,7 +180,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "serde", + "wasm-bindgen", "windows-link", ] @@ -284,6 +287,15 @@ dependencies = [ "tokio", ] +[[package]] +name = "colibri-skills" +version = "0.0.1" +dependencies = [ + "chrono", + "serde", + "serde_json", +] + [[package]] name = "colibri-store" version = "0.0.1" diff --git a/Cargo.toml b/Cargo.toml index cdd1b4b..8e952d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store"] +members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store", "crates/colibri-skills"] [package] name = "colibri" diff --git a/crates/colibri-skills/Cargo.toml b/crates/colibri-skills/Cargo.toml new file mode 100644 index 0000000..6997dc4 --- /dev/null +++ b/crates/colibri-skills/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "colibri-skills" +version = "0.0.1" +edition = "2021" +description = "Colibri skills read consumer — indexes Clawdie-AI skill artifacts into SQLite" +license = "AGPL-3.0-only" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +chrono = { version = "0.4", features = ["serde"] } diff --git a/crates/colibri-skills/src/lib.rs b/crates/colibri-skills/src/lib.rs new file mode 100644 index 0000000..4d54b11 --- /dev/null +++ b/crates/colibri-skills/src/lib.rs @@ -0,0 +1,322 @@ +//! 
Colibri Skills — read-only consumer for Clawdie-AI skill artifacts. +//! +//! This crate indexes committed, reviewed skill artifacts from the Clawdie-AI +//! repository into SQLite. It does NOT author, edit, or store skill content — +//! that responsibility lives in Clawdie-AI. +//! +//! ```text +//! Clawdie-AI repo (source of truth) +//! docs/astro-howto/ +//! docs/forgejo-admin/ +//! ... +//! +//! colibri-skills (read-only consumer) +//! reads run_manifest.json +//! validates checksums +//! indexes markdown/transcript chunks +//! serves CLI/TUI search +//! ``` + +use serde::{Deserialize, Serialize}; + +// ── Core types ──────────────────────────────────────────────────────────── + +/// A read-only skill artifact indexed from Clawdie-AI. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Skill { + pub skill_id: String, + pub display_name: String, + /// Relative path within the Clawdie-AI repo (e.g. "docs/astro-howto"). + pub source_path: String, + pub manifest: SkillManifest, + pub artifacts: Vec, + pub status: SkillStatus, + /// Natural-language verification test. + pub verification: Option, +} + +/// Parsed from `run_manifest.json` in a skill directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillManifest { + pub run_id: String, + pub created: String, + pub source: Option, + pub pipeline_stages: Vec, + pub models_used: Vec, + pub notes: Option, +} + +/// Source media metadata from the manifest. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ManifestSource { + pub path: Option, + pub size_human: Option, + pub codec: Option, + pub resolution: Option, + pub duration_human: Option, + pub original_untouched: Option, +} + +/// A single stage in the skill generation pipeline. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PipelineStage { + pub stage: String, + pub tool: String, + pub model: Option, + pub output: Option, + pub duration_s: Option, + pub lines: Option, + pub size_human: Option, +} + +/// A model used during skill generation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ModelUsage { + pub model: String, + pub purpose: String, + pub api_key_used: bool, +} + +/// An individual file within a skill directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillArtifact { + pub artifact_type: ArtifactType, + /// Path relative to the skill directory. + pub relative_path: String, + pub file_name: String, + pub mime_type: Option, + pub size_bytes: u64, + pub sha256_hash: String, +} + +/// Classification of a skill artifact file. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ArtifactType { + Document, + Image, + Script, + Transcript, + Manifest, + Checksum, + Report, + ContactSheet, + Other(String), +} + +impl ArtifactType { + pub fn from_path(path: &str) -> Self { + let lower = path.to_lowercase(); + // Check specific patterns before generic extensions. 
+ if lower.ends_with(".py") || lower.ends_with(".sh") || lower.ends_with(".bash") { + return ArtifactType::Script; + } + if lower.contains("contact_sheet") { + return ArtifactType::ContactSheet; + } + if lower.contains("run_manifest") && lower.ends_with(".json") { + return ArtifactType::Manifest; + } + if lower.contains("sha256") || lower.contains("checksum") { + return ArtifactType::Checksum; + } + if lower.contains("report") && lower.ends_with(".json") { + return ArtifactType::Report; + } + if lower.ends_with(".md") { + ArtifactType::Document + } else if lower.ends_with(".jpg") || lower.ends_with(".png") || lower.ends_with(".webp") { + ArtifactType::Image + } else if lower.ends_with(".txt") && lower.contains("transcript") { + ArtifactType::Transcript + } else { + ArtifactType::Other(path.to_string()) + } + } +} + +/// A chunk of searchable content extracted from a skill artifact. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillChunk { + pub chunk_id: Option, + pub skill_id: String, + pub artifact_relative_path: String, + pub chunk_type: ChunkType, + /// Nearest markdown heading, if applicable. + pub heading: Option, + pub content: String, + pub line_start: u64, + pub line_end: u64, + pub tokens_estimate: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ChunkType { + MarkdownSection, + TranscriptSegment, + CommandBlock, + CodeBlock, + Table, +} + +// ── Status ──────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SkillStatus { + Active, + Archived, + Superseded, +} + +impl Default for SkillStatus { + fn default() -> Self { + SkillStatus::Active + } +} + +// ── Import summary ──────────────────────────────────────────────────────── + +/// Returned after indexing a Clawdie-AI checkout. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImportSummary { + pub skills_found: usize, + pub skills_indexed: usize, + pub skills_skipped: usize, + pub artifacts_total: usize, + pub chunks_total: usize, + pub checksum_failures: usize, + pub errors: Vec, +} + +impl ImportSummary { + pub fn success(&self) -> bool { + self.errors.is_empty() && self.checksum_failures == 0 + } +} + +// ── Search result ───────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SearchResult { + pub skill_id: String, + pub display_name: String, + pub chunk_type: ChunkType, + pub heading: Option, + pub snippet: String, + pub artifact_path: String, + pub line_start: u64, +} + +// ── Tests ───────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn artifact_type_from_path_document() { + assert_eq!( + ArtifactType::from_path("docs/HOWTO.md"), + ArtifactType::Document + ); + } + + #[test] + fn artifact_type_from_path_image() { + assert_eq!( + ArtifactType::from_path("screenshots/001_00-01-05_intro.jpg"), + ArtifactType::Image + ); + } + + #[test] + fn artifact_type_from_path_script_python() { + assert_eq!( + ArtifactType::from_path("scripts/generate_contact_sheet.py"), + ArtifactType::Script + ); + } + + #[test] + fn artifact_type_from_path_script_shell() { + assert_eq!( + ArtifactType::from_path("scripts/extract_screenshots.sh"), + ArtifactType::Script + ); + } + + #[test] + fn artifact_type_from_path_transcript() { + assert_eq!( + ArtifactType::from_path("transcript_local.txt"), + ArtifactType::Transcript + ); + } + + #[test] + fn artifact_type_from_path_manifest() { + assert_eq!( + ArtifactType::from_path("run_manifest.json"), + ArtifactType::Manifest + ); + } + + #[test] + fn artifact_type_from_path_checksum() { + assert_eq!( + ArtifactType::from_path("artifacts.sha256"), + ArtifactType::Checksum + ); + } + + #[test] + fn 
artifact_type_from_path_report() { + assert_eq!( + ArtifactType::from_path("contact-sheet/report.json"), + ArtifactType::Report + ); + } + + #[test] + fn artifact_type_from_path_contact_sheet() { + assert_eq!( + ArtifactType::from_path("contact-sheet/contact_sheet.jpg"), + ArtifactType::ContactSheet + ); + } + + #[test] + fn skill_status_default_is_active() { + assert_eq!(SkillStatus::default(), SkillStatus::Active); + } + + #[test] + fn import_summary_success_empty() { + let s = ImportSummary { + skills_found: 0, + skills_indexed: 0, + skills_skipped: 0, + artifacts_total: 0, + chunks_total: 0, + checksum_failures: 0, + errors: vec![], + }; + assert!(s.success()); + } + + #[test] + fn import_summary_failure_on_error() { + let s = ImportSummary { + skills_found: 1, + skills_indexed: 0, + skills_skipped: 1, + artifacts_total: 0, + chunks_total: 0, + checksum_failures: 0, + errors: vec!["manifest parse error".into()], + }; + assert!(!s.success()); + } +} diff --git a/doc/COLIBRI-SKILLS-PLAN.md b/doc/COLIBRI-SKILLS-PLAN.md new file mode 100644 index 0000000..db4c0a0 --- /dev/null +++ b/doc/COLIBRI-SKILLS-PLAN.md @@ -0,0 +1,254 @@ +# Colibri Skills — Split-Brain Read Consumer + +## Purpose + +`colibri-skills` is the read-only runtime consumer for skill artifacts authored +in the Clawdie-AI repo. It does NOT store or author skills — it indexes them. + +```text + Clawdie-AI repo (source of truth) + docs/astro-howto/ + docs/forgejo-admin/ + docs/vaultwarden-onboarding/ + ... + + Colibri colibri-skills crate (read-only consumer) + reads committed skill artifacts + validates checksums + indexes Markdown/transcript chunks + exposes Skill, SkillArtifact, SkillChunk structs + serves CLI/TUI search +``` + +## Provenance example + +The first seed artifact: `docs/astro-howto/` in Clawdie-AI. 
+ +```json +{ + "skill_id": "astro-howto", + "source": "local video-derived training artifact", + "inputs": [ + "transcript_local.txt", + "screenshots/", + "contact-sheet/contact_sheet.jpg" + ], + "outputs": [ + "docs/HOWTO.md", + "docs/COMMANDS.md", + "docs/SCREENSHOTS.md", + "docs/SUMMARY.md" + ], + "verification": "can user create and run an Astro project?", + "media": "screenshots/*.jpg (paths + hashes, not blobs)", + "manifest": "run_manifest.json", + "checksums": "artifacts.sha256" +} +``` + +Pipeline that produced it: + +```text + video → local transcript → topic extraction → how-to/runbook + → screenshots/contact sheet → commands → verification test + → manifest + checksums → reviewed skill artifact → Colibri read-only index +``` + +## Architecture + +### Ownership + +| Layer | Role | Writes | Reads | +|-------|------|--------|-------| +| Clawdie-AI | Source of truth | Skill artifacts via PR | N/A | +| Colibri colibri-skills | Runtime consumer | Never | Indexed skill structs from committed artifacts | +| Agents (Hermes, Claude, Codex) | Authors | Create PRs with skill artifacts | Skill content for task routing | +| system_brain | Agent memory | Personal/user/agent context | N/A — not canonical docs | +| system_ops | Runtime state | Live task/service state | N/A — not skills | + +### What Colibri-skills does NOT do + +- Store skill content (that's the Clawdie-AI repo's job) +- Store image blobs in SQLite (paths + hashes only) +- Write, edit, or create skills (read-only) +- Replace system_brain (personal/agent memory is separate) +- Replace system_ops (runtime state is separate) + +### What Colibri-skills DOES + +- Read skill manifests from a configured Clawdie-AI checkout path +- Parse run_manifest.json for each skill +- Validate checksums against artifacts.sha256 +- Index Markdown sections for search +- Expose Skill, SkillArtifact, SkillChunk structs +- Persist the index in SQLite (system_skills, system_skill_artifacts, system_skill_chunks) + +## SQLite schema + +```sql 
+CREATE TABLE system_skills ( + skill_id TEXT PRIMARY KEY, + display_name TEXT NOT NULL, + source_path TEXT NOT NULL, -- relative within Clawdie-AI repo + manifest_hash TEXT, -- sha256 of run_manifest.json + created_at TEXT NOT NULL, -- ISO 8601 + updated_at TEXT NOT NULL, + verification TEXT, -- natural-language verification test + status TEXT NOT NULL DEFAULT 'active' -- active, archived, superseded +); + +CREATE TABLE system_skill_artifacts ( + artifact_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_type TEXT NOT NULL, -- document, image, script, transcript, manifest, checksum, report, contact_sheet, other + relative_path TEXT NOT NULL, -- within the skill directory + file_name TEXT NOT NULL, + mime_type TEXT, + size_bytes INTEGER, + sha256_hash TEXT NOT NULL, + UNIQUE(skill_id, relative_path) +); + +CREATE TABLE system_skill_chunks ( + chunk_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_id INTEGER NOT NULL REFERENCES system_skill_artifacts(artifact_id), + chunk_type TEXT NOT NULL, -- markdown_section, transcript_segment, command_block, code_block, table + heading TEXT, -- nearest markdown heading, if applicable + content TEXT NOT NULL, -- the chunk text (not full file) + line_start INTEGER, -- line number in source file + line_end INTEGER, + tokens_estimate INTEGER -- rough token count for search ranking +); + +CREATE INDEX idx_skills_status ON system_skills(status); +CREATE INDEX idx_artifacts_skill ON system_skill_artifacts(skill_id); +CREATE INDEX idx_artifacts_type ON system_skill_artifacts(artifact_type); +CREATE INDEX idx_chunks_skill ON system_skill_chunks(skill_id); +CREATE INDEX idx_chunks_type ON system_skill_chunks(chunk_type); +CREATE VIRTUAL TABLE IF NOT EXISTS skill_fts USING fts5( + content, + heading, + skill_id, + chunk_type, + content=system_skill_chunks, + content_rowid=chunk_id +); +``` + +## Rust structs (planned) + +```rust +// crates/colibri-skills/src/lib.rs + 
+pub struct Skill { + pub skill_id: String, + pub display_name: String, + pub source_path: String, + pub manifest: SkillManifest, + pub artifacts: Vec<SkillArtifact>, + pub status: SkillStatus, + pub verification: Option<String>, +} + +pub struct SkillManifest { + pub run_id: String, + pub pipeline_stages: Vec<PipelineStage>, + pub models_used: Vec<ModelUsage>, + pub created: String, +} + +pub struct PipelineStage { + pub stage: String, + pub tool: String, + pub output: String, + pub duration_s: Option<f64>, +} + +pub struct ModelUsage { + pub model: String, + pub purpose: String, + pub api_key_used: bool, +} + +pub struct SkillArtifact { + pub artifact_type: ArtifactType, + pub relative_path: String, + pub file_name: String, + pub mime_type: Option<String>, + pub size_bytes: u64, + pub sha256_hash: String, +} + +pub enum ArtifactType { + Document, // .md + Image, // .jpg, .png, .webp + Script, // .py, .sh + Transcript, // .txt + Manifest, // run_manifest.json + Checksum, // artifacts.sha256 + Report, // report.json +} + +pub enum SkillStatus { + Active, + Archived, + Superseded, +} +``` + +## CLI surface (future) + +```sh +colibri list-skills # list all indexed skills +colibri show-skill <skill_id> # full skill metadata + artifact list +colibri search-skills <query> # FTS5 search across chunks +colibri index-skills # re-index from Clawdie-AI checkout +colibri verify-skill <skill_id> # re-validate checksums +``` + +## Import flow + +``` +1. colibri-skills reads Clawdie-AI path from config/env +2. Scans for directories containing run_manifest.json +3. Parses manifest → extracts skill metadata +4. For each artifact in manifest: reads file, computes/verifies sha256 +5. Chunks Markdown files by heading, transcript files by segment +6. Inserts/updates SQLite rows (idempotent — replace on conflict) +7. 
Returns import summary: skills found, artifacts indexed, chunks created +``` + +## Portability rules + +- Image paths and hashes stored, not blobs +- Local provenance paths (e.g., `/home/samob/Videos/...`) are nonportable + metadata — stored but never assumed to exist at runtime +- Checksums are portable — verified against artifact content, not source paths +- Paths stored as relative within Clawdie-AI repo + +## Future skillpacks (candidates) + +```text + astro-howto + forgejo-admin + vaultwarden-onboarding + freebsd-update-reboot + colibri-iso-build + zed-on-freebsd + pi-headless-login +``` + +## Implementation phases + +| Phase | What | Depends on | +|-------|------|-----------| +| 1 | Scaffold crate + structs + SQLite schema | Nothing | +| 2 | Manifest parser (run_manifest.json → SkillManifest) | Phase 1 | +| 3 | Checksum validator (artifacts.sha256 → verify) | Phase 2 | +| 4 | Markdown chunker (heading-based) | Phase 1 | +| 5 | FTS5-backed search | Phases 3, 4 | +| 6 | CLI commands (list, show, search, index) | Phases 3, 4 | +| 7 | Daemon integration (auto-index on config path changes) | Phase 6 | + +Phase 1 is scaffold-only — compile check, no runtime behavior.