diff --git a/Cargo.lock b/Cargo.lock index 2c32456..5442670 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,7 +180,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "serde", + "wasm-bindgen", "windows-link", ] @@ -284,6 +287,15 @@ dependencies = [ "tokio", ] +[[package]] +name = "colibri-skills" +version = "0.0.1" +dependencies = [ + "chrono", + "serde", + "serde_json", +] + [[package]] name = "colibri-store" version = "0.0.1" diff --git a/Cargo.toml b/Cargo.toml index cdd1b4b..8e952d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store"] +members = ["crates/colibri-contracts", "crates/colibri-deepseek", "crates/colibri-runtime", "crates/colibri-glasspane", "crates/colibri-daemon", "crates/colibri-client", "crates/colibri-glasspane-tui", "crates/colibri-store", "crates/colibri-skills"] [package] name = "colibri" diff --git a/crates/colibri-skills/Cargo.toml b/crates/colibri-skills/Cargo.toml new file mode 100644 index 0000000..6997dc4 --- /dev/null +++ b/crates/colibri-skills/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "colibri-skills" +version = "0.0.1" +edition = "2021" +description = "Colibri skills read consumer — indexes Clawdie-AI skill artifacts into SQLite" +license = "AGPL-3.0-only" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +chrono = { version = "0.4", features = ["serde"] } diff --git a/crates/colibri-skills/src/lib.rs b/crates/colibri-skills/src/lib.rs new file mode 100644 index 0000000..06934ff --- /dev/null +++ b/crates/colibri-skills/src/lib.rs @@ -0,0 +1,317 @@ +//! Colibri Skills — read-only consumer for Clawdie-AI skill artifacts. +//! +//! This crate indexes committed, reviewed skill artifacts from the Clawdie-AI +//! repository into SQLite. It does NOT author, edit, or store skill content — +//! that responsibility lives in Clawdie-AI. +//! +//! ```text +//! Clawdie-AI repo (source of truth) +//! docs/astro-howto/ +//! docs/forgejo-admin/ +//! ... +//! +//! colibri-skills (read-only consumer) +//! reads run_manifest.json +//! validates checksums +//! indexes markdown/transcript chunks +//! serves CLI/TUI search +//! ``` + +use serde::{Deserialize, Serialize}; + +// ── Core types ──────────────────────────────────────────────────────────── + +/// A read-only skill artifact indexed from Clawdie-AI. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Skill { + pub skill_id: String, + pub display_name: String, + /// Relative path within the Clawdie-AI repo (e.g. "docs/astro-howto"). + pub source_path: String, + pub manifest: SkillManifest, + pub artifacts: Vec, + pub status: SkillStatus, + /// Natural-language verification test. + pub verification: Option, +} + +/// Parsed from `run_manifest.json` in a skill directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillManifest { + pub run_id: String, + pub created: String, + pub source: Option, + pub pipeline_stages: Vec, + pub models_used: Vec, + pub notes: Option, +} + +/// Source media metadata from the manifest. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ManifestSource { + pub path: Option, + pub size_human: Option, + pub codec: Option, + pub resolution: Option, + pub duration_human: Option, + pub original_untouched: Option, +} + +/// A single stage in the skill generation pipeline. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PipelineStage { + pub stage: String, + pub tool: String, + pub model: Option, + pub output: Option, + pub duration_s: Option, + pub lines: Option, + pub size_human: Option, +} + +/// A model used during skill generation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ModelUsage { + pub model: String, + pub purpose: String, + pub api_key_used: bool, +} + +/// An individual file within a skill directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillArtifact { + pub artifact_type: ArtifactType, + /// Path relative to the skill directory. + pub relative_path: String, + pub file_name: String, + pub mime_type: Option, + pub size_bytes: u64, + pub sha256_hash: String, +} + +/// Classification of a skill artifact file. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ArtifactType { + Document, + Image, + Script, + Transcript, + Manifest, + Checksum, + Report, + ContactSheet, + Other(String), +} + +impl ArtifactType { + pub fn from_path(path: &str) -> Self { + let lower = path.to_lowercase(); + // Check specific patterns before generic extensions. + if lower.ends_with(".py") || lower.ends_with(".sh") || lower.ends_with(".bash") { + return ArtifactType::Script; + } + if lower.contains("contact_sheet") { + return ArtifactType::ContactSheet; + } + if lower.contains("run_manifest") && lower.ends_with(".json") { + return ArtifactType::Manifest; + } + if lower.contains("sha256") || lower.contains("checksum") { + return ArtifactType::Checksum; + } + if lower.contains("report") && lower.ends_with(".json") { + return ArtifactType::Report; + } + if lower.ends_with(".md") { + ArtifactType::Document + } else if lower.ends_with(".jpg") || lower.ends_with(".png") || lower.ends_with(".webp") { + ArtifactType::Image + } else if lower.ends_with(".txt") && lower.contains("transcript") { + ArtifactType::Transcript + } else { + ArtifactType::Other(path.to_string()) + } + } +} + +/// A chunk of searchable content extracted from a skill artifact. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkillChunk { + pub chunk_id: Option, + pub skill_id: String, + pub artifact_relative_path: String, + pub chunk_type: ChunkType, + /// Nearest markdown heading, if applicable. + pub heading: Option, + pub content: String, + pub line_start: u64, + pub line_end: u64, + pub tokens_estimate: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ChunkType { + MarkdownSection, + TranscriptSegment, + CommandBlock, + CodeBlock, + Table, +} + +// ── Status ──────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SkillStatus { + #[default] + Active, + Archived, + Superseded, +} + +// ── Import summary ──────────────────────────────────────────────────────── + +/// Returned after indexing a Clawdie-AI checkout. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImportSummary { + pub skills_found: usize, + pub skills_indexed: usize, + pub skills_skipped: usize, + pub artifacts_total: usize, + pub chunks_total: usize, + pub checksum_failures: usize, + pub errors: Vec, +} + +impl ImportSummary { + pub fn success(&self) -> bool { + self.errors.is_empty() && self.checksum_failures == 0 + } +} + +// ── Search result ───────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SearchResult { + pub skill_id: String, + pub display_name: String, + pub chunk_type: ChunkType, + pub heading: Option, + pub snippet: String, + pub artifact_path: String, + pub line_start: u64, +} + +// ── Tests ───────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn artifact_type_from_path_document() { + assert_eq!( + ArtifactType::from_path("docs/HOWTO.md"), + ArtifactType::Document + ); + } + + #[test] + fn artifact_type_from_path_image() { + assert_eq!( + ArtifactType::from_path("screenshots/001_00-01-05_intro.jpg"), + ArtifactType::Image + ); + } + + #[test] + fn artifact_type_from_path_script_python() { + assert_eq!( + ArtifactType::from_path("scripts/generate_contact_sheet.py"), + ArtifactType::Script + ); + } + + #[test] + fn artifact_type_from_path_script_shell() { + assert_eq!( + ArtifactType::from_path("scripts/extract_screenshots.sh"), + ArtifactType::Script + ); + } + + #[test] + fn artifact_type_from_path_transcript() { + assert_eq!( + ArtifactType::from_path("transcript_local.txt"), + ArtifactType::Transcript + ); + } + + #[test] + fn artifact_type_from_path_manifest() { + assert_eq!( + ArtifactType::from_path("run_manifest.json"), + ArtifactType::Manifest + ); + } + + #[test] + fn artifact_type_from_path_checksum() { + assert_eq!( + ArtifactType::from_path("artifacts.sha256"), + ArtifactType::Checksum + ); + } + + #[test] + fn artifact_type_from_path_report() { + assert_eq!( + ArtifactType::from_path("contact-sheet/report.json"), + ArtifactType::Report + ); + } + + #[test] + fn artifact_type_from_path_contact_sheet() { + assert_eq!( + ArtifactType::from_path("contact-sheet/contact_sheet.jpg"), + ArtifactType::ContactSheet + ); + } + + #[test] + fn skill_status_default_is_active() { + assert_eq!(SkillStatus::default(), SkillStatus::Active); + } + + #[test] + fn import_summary_success_empty() { + let s = ImportSummary { + skills_found: 0, + skills_indexed: 0, + skills_skipped: 0, + artifacts_total: 0, + chunks_total: 0, + checksum_failures: 0, + errors: vec![], + }; + assert!(s.success()); + } + + #[test] + fn import_summary_failure_on_error() { + let s = ImportSummary { + skills_found: 1, + skills_indexed: 0, + skills_skipped: 1, + artifacts_total: 0, + chunks_total: 0, + checksum_failures: 0, + errors: vec!["manifest parse error".into()], + }; + assert!(!s.success()); + } +} diff --git a/docs/COLIBRI-SKILLS-PLAN.md b/docs/COLIBRI-SKILLS-PLAN.md index e39e4fb..c46a650 100644 --- a/docs/COLIBRI-SKILLS-PLAN.md +++ b/docs/COLIBRI-SKILLS-PLAN.md @@ -1,211 +1,231 @@ # Colibri Skills Plan -**Date:** 27.maj.2026 -**Status:** PLANNED — docs first, no crate scaffold yet +**Status:** Phase 1 scaffolded — read-only split-brain consumer + +**Crate:** `crates/colibri-skills` ## Purpose -Define the next Colibri slice for Clawdie's split-brain model: +`colibri-skills` is Colibri's read-only runtime consumer for reviewed skill +artifacts authored in the Clawdie-AI repo. It does not author, edit, or store +canonical skills. Clawdie-AI remains the source of truth; Colibri indexes and +serves typed/runtime views. -- **built-in knowledge** stays separate from -- **user/agent memory** and -- **operational state** +```text +Clawdie-AI repo (source of truth) + docs/astro-howto/ + docs/forgejo-admin/ + docs/vaultwarden-onboarding/ + ... -The product explanation is simple: +Colibri colibri-skills crate (read-only consumer) + reads committed skill artifacts + validates checksums + indexes Markdown/transcript chunks + exposes Skill, SkillArtifact, SkillChunk structs + serves CLI/TUI/search later +``` -> Think of it like the machine already ships with its handbook inside. +This keeps the split-brain model explicit: -In Clawdie terms, that handbook is the **committed built-in knowledge -artifact** imported into `system_skills`. +- `system_skills`: committed built-in knowledge / manuals / reviewed skillpacks +- `system_brain`: user and agent memory +- `system_ops`: live runtime, task, service, and daemon state -This plan names the Rust-side consumer for that lane: +## Seed artifact: Astro how-to -- **crate name:** `colibri-skills` +The first concrete skillpack is `docs/astro-howto/` in Clawdie-AI. It is useful +because it is not just prose; it includes transcript, generated how-to docs, +commands, screenshots, contact sheet, manifest, checksums, and scripts. -It is a **read path first**, not a new generator, not a new database, and not a -memory replacement. +```json +{ + "skill_id": "astro-howto", + "source": "local video-derived training artifact", + "inputs": [ + "transcript_local.txt", + "screenshots/", + "contact-sheet/contact_sheet.jpg" + ], + "outputs": [ + "docs/HOWTO.md", + "docs/COMMANDS.md", + "docs/SCREENSHOTS.md", + "docs/SUMMARY.md" + ], + "verification": "can user create and run an Astro project?", + "media": "screenshots/*.jpg (paths + hashes, not blobs)", + "manifest": "run_manifest.json", + "checksums": "artifacts.sha256" +} +``` -## Background +Pipeline shape: -The current split-brain design already exists in `clawdie-ai`: +```text +video → local transcript → topic extraction → how-to/runbook +→ screenshots/contact sheet → commands → verification test +→ manifest + checksums → reviewed skill artifact → Colibri read-only index +``` -- `system_skills` - - built-in knowledge / manuals already included - - sourced from `bootstrap/skills-memory/artifact.sql` -- `system_brain` - - dynamic user/agent memory -- `system_ops` - - runtime and operational state +## Ownership -Colibri should reflect that boundary explicitly instead of flattening all -knowledge into one vague "brain." +| Layer | Role | Writes | Reads | +|-------|------|--------|-------| +| Clawdie-AI | Source of truth | Skill artifacts via PR | N/A | +| `colibri-skills` | Runtime consumer | Never to source repo | Indexed skill structs from committed artifacts | +| Agents | Authors/reviewers | Candidate skill artifact PRs | Skill content for task routing | +| `system_brain` | Agent/user memory | Personal/user/agent context | Not canonical skill docs | +| `system_ops` | Runtime state | Live task/service state | Not skills | -## What `colibri-skills` should own +## What `colibri-skills` does -`colibri-skills` should own the Rust-side access layer for **built-in -knowledge only**. +- Read skill manifests from a configured Clawdie-AI checkout path +- Parse `run_manifest.json` +- Validate checksums against `artifacts.sha256` +- Classify artifacts as document, image, script, transcript, manifest, checksum, + report, contact sheet, or other +- Index Markdown/transcript chunks for search +- Expose stable typed structs for daemon/client/TUI callers +- Persist runtime index metadata in SQLite -Phase-1 responsibilities: +## What `colibri-skills` does not do -- connect to the `system_skills` database -- report artifact/import status -- expose stable typed results for built-in knowledge lookup -- support text search over committed handbook content -- return structured summaries that other Colibri crates can consume +- Author, edit, or create skills +- Store image blobs in SQLite; store paths and hashes only +- Replace `system_brain` +- Replace `system_ops` +- Own provider/API budget logic +- Require nonportable local source media paths at runtime -Good callers later: +## Phase 1 delivered -- `colibri-daemon` -- `colibri-client` -- `colibri-glasspane-tui` +The scaffold crate now provides: -## What `colibri-skills` must not own +- `Skill` +- `SkillManifest` +- `SkillArtifact` +- `SkillChunk` +- `ArtifactType` +- `SkillStatus` +- `ImportSummary` +- `SearchResult` +- unit tests for artifact classification and status/summary behavior -It should **not** own: +Phase 1 is intentionally scaffold-only: compile and type proof, no runtime +import behavior yet. -- user memory retrieval from `system_brain` -- task/session/runtime state from `system_ops` -- artifact generation or embedding refresh -- OpenRouter or embedding-provider budget logic -- ISO build-time content packing +## SQLite schema target -Those already belong elsewhere. +```sql +CREATE TABLE system_skills ( + skill_id TEXT PRIMARY KEY, + display_name TEXT NOT NULL, + source_path TEXT NOT NULL, -- relative within Clawdie-AI repo + manifest_hash TEXT, -- sha256 of run_manifest.json + created_at TEXT NOT NULL, -- ISO 8601 + updated_at TEXT NOT NULL, + verification TEXT, -- natural-language verification test + status TEXT NOT NULL DEFAULT 'active' -- active, archived, superseded +); -## Why a separate crate +CREATE TABLE system_skill_artifacts ( + artifact_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_type TEXT NOT NULL, + relative_path TEXT NOT NULL, -- within the skill directory + file_name TEXT NOT NULL, + mime_type TEXT, + size_bytes INTEGER, + sha256_hash TEXT NOT NULL, + UNIQUE(skill_id, relative_path) +); -This wants its own crate because it is a separate architectural lane: +CREATE TABLE system_skill_chunks ( + chunk_id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_id TEXT NOT NULL REFERENCES system_skills(skill_id), + artifact_id INTEGER NOT NULL REFERENCES system_skill_artifacts(artifact_id), + chunk_type TEXT NOT NULL, + heading TEXT, + content TEXT NOT NULL, + line_start INTEGER, + line_end INTEGER, + tokens_estimate INTEGER +); -- distinct data source -- distinct lifecycle -- distinct operator story -- distinct proof gates +CREATE INDEX idx_skills_status ON system_skills(status); +CREATE INDEX idx_artifacts_skill ON system_skill_artifacts(skill_id); +CREATE INDEX idx_artifacts_type ON system_skill_artifacts(artifact_type); +CREATE INDEX idx_chunks_skill ON system_skill_chunks(skill_id); +CREATE INDEX idx_chunks_type ON system_skill_chunks(chunk_type); -It is not just a helper module inside `colibri-daemon`. +CREATE VIRTUAL TABLE IF NOT EXISTS skill_fts USING fts5( + content, + heading, + skill_id, + chunk_type, + content=system_skill_chunks, + content_rowid=chunk_id +); +``` -That keeps the split-brain boundary visible in code: +## Import flow target -- `colibri-runtime` = host/runtime facts -- `colibri-skills` = built-in handbook facts -- future memory lane = user/agent memory, if needed later +1. Read Clawdie-AI checkout path from config/env. +2. Scan for directories containing `run_manifest.json`. +3. Parse manifest and derive skill metadata. +4. Read artifacts, compute SHA-256, and verify `artifacts.sha256` when present. +5. Chunk Markdown by heading and transcripts by timestamp/segment. +6. Upsert SQLite rows idempotently. +7. Return `ImportSummary` with skills found/indexed/skipped, artifacts, chunks, + checksum failures, and errors. -## Phase 0 — source map before scaffolding +## CLI surface target -Before adding the crate, copy the TypeScript contract into this document or a -small fixture so the Rust implementation does not guess at table names or env -wiring. +```sh +colibri list-skills +colibri show-skill +colibri search-skills +colibri index-skills +colibri verify-skill +``` -Known current tables from `clawdie-ai/src/split-brain-status.ts` and -`bootstrap/skills-memory/artifact.sql`: +## Portability rules -- `builtin_knowledge_artifacts` -- `builtin_knowledge_documents` -- `builtin_knowledge_chunks` -- `builtin_knowledge_embeddings` +- Store image paths and hashes, not blobs. +- Treat local provenance paths like `/home/samob/Videos/...` as metadata only. +- Verify checksums against committed artifacts, not local source paths. +- Store paths relative to the Clawdie-AI repo. +- Normal tests must not require PostgreSQL, remote Forgejo, or local source + videos/audio. -Phase-0 checks: +## Future skillpacks -1. Identify the exact read-only connection source/env var used for the skills - DB in production. -2. Record the minimum queries needed for status: - - artifact row count - - document row count - - chunk row count - - latest `artifact_version` -3. Choose an opt-in live-test env var so normal workspace tests do **not** - require PostgreSQL. -4. Add offline fixture rows/golden JSON before any live DB test. +```text +astro-howto +forgejo-admin +vaultwarden-onboarding +freebsd-update-reboot +colibri-iso-build +zed-on-freebsd +pi-headless-login +``` -## Phase 1 — read-only consumer +## Implementation phases -The first implementation should stay intentionally small. - -Deliverables: - -1. New crate scaffold: - - `crates/colibri-skills` -2. DB-facing types for: - - artifact metadata - - documents - - chunks - - text search results -3. Read-only API for: - - artifact/import status - - list current artifact version - - search built-in knowledge by text -4. Unit tests with fixture rows or golden snapshots -5. Optional ignored/opt-in integration test for a real skills DB - -Phase 1 goal: - -> prove Colibri can read the "manuals already included" lane cleanly, without -> touching memory or generator code. - -## Phase 2 — daemon integration - -After Phase 1 is proven, wire it into Colibri surfaces: - -- `colibri-daemon` - - expose a query path for built-in handbook lookups -- `colibri-client` - - typed client method -- `colibri-glasspane-tui` - - optional "manuals included" / artifact-status pane - -This should still remain read-only. - -## Phase 3 — optional deeper work - -Only after the read path is stable should we consider: - -- vector similarity for built-in knowledge -- richer ranking/grounding -- parity checks against the TypeScript retrieval layer -- eventual Rust ownership of artifact generation - -That is explicitly **not** phase 1. - -## Contracts and proof - -The proof for `colibri-skills` should be simple: - -1. Offline tests can parse fixture rows and return typed status/search results. -2. With an explicit live DB env var, Linux and FreeBSD can connect read-only to - the skills DB / `system_skills` lane. -3. Colibri can report whether the committed artifact is imported. -4. Colibri can query known built-in handbook content by text. -5. Results are typed and stable enough for daemon/client use. - -This is narrower than the daemon proof gates and should stay that way. - -## Naming - -Why `colibri-skills`: - -- it matches the existing `system_skills` boundary -- it stays concrete -- it maps to the "manuals already included" story without pretending it owns - all knowledge - -Rejected names for now: - -- `colibri-brain` - - too vague; collides with user memory -- `colibri-knowledge` - - broader than the actual lane -- `colibri-manuals` - - good product language, but weaker alignment with existing technical names - -## Non-goals - -- no new repo -- no crate inside `clawdie-iso` -- no merge of `system_skills` with `system_brain` -- no rewrite of `bootstrap/skills-memory/artifact.sql` generation yet -- no embedding refresh port to Rust yet +| Phase | What | Depends on | +|-------|------|------------| +| 1 | Scaffold crate + structs + schema plan | Nothing | +| 2 | Manifest parser (`run_manifest.json` → `SkillManifest`) | Phase 1 | +| 3 | Checksum validator (`artifacts.sha256` → verify) | Phase 2 | +| 4 | Markdown/transcript chunker | Phase 1 | +| 5 | SQLite storage + FTS5 search | Phases 3, 4 | +| 6 | CLI commands (`list`, `show`, `search`, `index`, `verify`) | Phase 5 | +| 7 | Daemon/client/TUI integration | Phase 6 | ## Related sources -- `clawdie-ai/docs/internal/SKILLS-ARTIFACT-V1-PLAN.md` -- `clawdie-ai/html/docs-clawdie-si/docs/split-brain.html` -- `clawdie-ai/setup/skills-memory.ts` +- `clawdie-ai/docs/astro-howto/` +- `clawdie-ai/docs/VAULTWARDEN-SETUP.md` +- `clawdie-ai/bootstrap/skills-memory/artifact.sql` - `clawdie-ai/src/split-brain-status.ts`