clawdie-ai/scripts/memory/embed-builtin-knowledge.py

#!/usr/bin/env python3
"""
scripts/memory/embed-builtin-knowledge.py — Generate artifact.sql for built-in knowledge.

Discovers all project docs and skill definitions, chunks by markdown headings,
generates embeddings via OpenRouter (BAAI/bge-m3), and outputs a complete
artifact.sql ready for import into the skills database.

Usage:
    OPENROUTER_API_KEY=sk-or-... python3 scripts/memory/embed-builtin-knowledge.py
    OPENROUTER_API_KEY=sk-or-... python3 scripts/memory/embed-builtin-knowledge.py --dry-run

Output (stdout): Complete artifact.sql with all INSERT statements.
Progress (stderr): Discovery and embedding progress.
"""

import hashlib
import json
import os
import re
import subprocess
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# Repository root: three directory levels above this file (the module docstring
# places the script at scripts/memory/). All glob patterns below are resolved
# against it, and it is the working directory for the `git rev-parse` call.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# ── Config ────────────────────────────────────────────────────────────────────

# Endpoint that embed_batch() POSTs the embedding request to.
OPENROUTER_URL = "https://openrouter.ai/api/v1/embeddings"
# Model id sent in the request body; its lower-cased form is what gets written
# into the generated SQL.
MODEL = "BAAI/bge-m3"
# Written to the artifact row and metadata as the embedding dimension count.
# NOTE(review): the script does not compare this against the length of the
# vectors the API actually returns — confirm it matches the model.
DIMENSIONS = 1024
# Maximum number of characters per chunk produced by chunk_markdown().
CHUNK_SIZE = 900
# Identifies the artifact row; the generated SQL upserts on this value, so
# re-importing the same version updates the existing row.
ARTIFACT_VERSION = "v1.0.2-full"
# Written to the artifact row's schema_version column.
SCHEMA_VERSION = "builtin-knowledge-base-v1"

# Files to skip — outdated, test, or meta-only content.
# Entries are compared against the bare file name, so a listed name is skipped
# in whichever directory it appears.
SKIP_FILES = {
    'nanoclaw-architecture-final.md',
    'WIZARD-SIMULATION-TEST.md',
    'DOCUMENTATION-SYNC-RUNBOOK.md',
}

# Doc patterns: (glob relative to PROJECT_ROOT, source_type).
# Order matters: a file matched by more than one pattern keeps the source_type
# of the first pattern that matched it.
DOC_PATTERNS = [
    ('docs/public/**/*.md', 'doc'),
    ('docs/internal/**/*.md', 'doc'),
    ('SOUL.md', 'identity'),
    ('IDENTITY.md', 'identity'),
    ('USER.md', 'identity'),
    ('AGENTS.md', 'identity'),
    ('MEMORY.md', 'identity'),
    ('CLAWDIE-ISO.md', 'identity'),
    ('.agent/skills/*/SKILL.md', 'skill'),
]

# ── Chunking ──────────────────────────────────────────────────────────────────

def chunk_markdown(text: str) -> list[str]:
    """Break markdown into chunks of at most CHUNK_SIZE characters.

    The text is first cut in front of every level 1-3 heading that starts a
    new line. A section that already fits becomes one chunk. A longer section
    is re-split on blank lines and its paragraphs are packed greedily, joined
    by a blank line; a single paragraph that is itself too long is sliced
    into fixed CHUNK_SIZE pieces.

    Args:
        text: Markdown source.

    Returns:
        The chunks in document order. When nothing survives stripping, a
        single-element list holding the first CHUNK_SIZE characters of
        ``text`` is returned instead of an empty list.
    """
    pieces: list[str] = []

    for raw_section in re.split(r'(?=\n#{1,3} )', text):
        body = raw_section.strip()
        if not body:
            continue
        if len(body) <= CHUNK_SIZE:
            pieces.append(body)
            continue

        # Oversized section: pack paragraphs into `pending` until the next
        # one would overflow (the +2 accounts for the blank-line separator).
        pending = ''
        for raw_para in re.split(r'\n\n+', body):
            block = raw_para.strip()
            if not block:
                continue
            fits = len(pending) + len(block) + 2 <= CHUNK_SIZE
            if fits and pending:
                pending = '\n\n'.join((pending, block)).strip()
            elif fits:
                pending = block
            else:
                if pending:
                    pieces.append(pending)
                # Hard-slice a paragraph that can never fit in one chunk; the
                # remainder seeds the next chunk.
                while len(block) > CHUNK_SIZE:
                    head, block = block[:CHUNK_SIZE], block[CHUNK_SIZE:]
                    pieces.append(head)
                pending = block
        if pending:
            pieces.append(pending)

    if pieces:
        return pieces
    return [text[:CHUNK_SIZE]]


# ── File discovery ────────────────────────────────────────────────────────────

def collect_files() -> list[tuple[Path, str]]:
    """Gather every file to embed, paired with its source_type.

    Each DOC_PATTERNS glob is expanded under PROJECT_ROOT in declaration
    order, with the matches of one pattern visited in sorted order. Files
    whose name is listed in SKIP_FILES are ignored, and a path already
    claimed by an earlier pattern is not added again.

    Returns:
        ``(path, source_type)`` tuples in discovery order.
    """
    collected: list[tuple[Path, str]] = []
    visited: set[Path] = set()

    for glob_pattern, kind in DOC_PATTERNS:
        for candidate in sorted(PROJECT_ROOT.glob(glob_pattern)):
            if candidate.name in SKIP_FILES or candidate in visited:
                continue
            visited.add(candidate)
            collected.append((candidate, kind))

    return collected


def derive_title(path: Path, text: str) -> str:
    """Pick a document title of at most 120 characters.

    Among the first ten lines of ``text``, the first one that starts with
    ``'# '`` once stripped supplies the title (leading ``#`` characters and
    surrounding whitespace removed). Without such a line, the title is built
    from the file stem: hyphens and underscores become spaces and the result
    is title-cased.

    Args:
        path: Location of the document; only its stem is used.
        text: Full document text.

    Returns:
        The derived title, truncated to 120 characters.
    """
    heading = next(
        (
            candidate
            for candidate in map(str.strip, text.splitlines()[:10])
            if candidate.startswith('# ')
        ),
        None,
    )
    if heading is not None:
        return heading.lstrip('#').strip()[:120]

    spaced_stem = path.stem.replace('-', ' ').replace('_', ' ')
    return spaced_stem.title()[:120]


# ── Embeddings ────────────────────────────────────────────────────────────────

def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed a batch of texts via OpenRouter.

    Sends a single request carrying all of ``texts`` to OPENROUTER_URL using
    MODEL, authenticated with the OPENROUTER_API_KEY environment variable.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without any
            network call or API-key lookup.

    Returns:
        One embedding per input string, ordered by the ``index`` field of
        the response items.

    Raises:
        SystemExit: OPENROUTER_API_KEY is unset or empty (exit status 1).
        RuntimeError: The response body has no ``data`` list, or the number
            of returned embeddings differs from ``len(texts)``.
        urllib.error.URLError: The HTTP request failed (includes HTTPError
            for non-2xx statuses).
    """
    if not texts:
        return []

    api_key = os.environ.get("OPENROUTER_API_KEY", "")
    if not api_key:
        print("ERROR: OPENROUTER_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    payload = json.dumps({"input": texts, "model": MODEL}).encode()
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # Supplying `data` makes urllib issue a POST.
    req = urllib.request.Request(OPENROUTER_URL, data=payload, headers=headers)
    with urllib.request.urlopen(req, timeout=120) as resp:
        data = json.load(resp)

    # A successful HTTP status can still carry an error payload; report it
    # rather than failing with a bare KeyError on "data".
    items = data.get("data") if isinstance(data, dict) else None
    if not isinstance(items, list):
        detail = data.get("error", data) if isinstance(data, dict) else data
        raise RuntimeError(f"embedding response has no 'data' list: {detail!r}")

    # Callers pair embeddings with inputs positionally, so a short or long
    # response must fail loudly instead of being silently truncated.
    if len(items) != len(texts):
        raise RuntimeError(
            f"expected {len(texts)} embeddings, got {len(items)}"
        )

    return [item["embedding"] for item in sorted(items, key=lambda item: item["index"])]


# ── SQL helpers ───────────────────────────────────────────────────────────────

def sql_escape(text: str) -> str:
    """Return ``text`` unchanged; this function performs no escaping.

    Despite its name it is a pass-through: the value it returns is not safe
    to place inside a single-quoted SQL literal. The SQL generation in this
    file dollar-quotes chunk text and doubles single quotes inline instead.

    NOTE(review): not called by any code visible in this file — confirm
    whether it can be removed.
    """
    # Intentionally a no-op: chunk text (which may contain single quotes) is
    # meant to be dollar-quoted by the caller rather than escaped here.
    return text


def format_vector(vec: list[float]) -> str:
    """Render a vector as a bracketed list literal.

    Args:
        vec: Numeric components of the vector.

    Returns:
        ``[c1,c2,...]`` with every component formatted to 8 decimal places
        and no spaces; an empty vector yields ``[]``.
    """
    components = ",".join(map("{:.8f}".format, vec))
    return f"[{components}]"


# ── Main ──────────────────────────────────────────────────────────────────────

def _dollar_quote(text: str, base_tag: str = "chunk") -> str:
    """Wrap ``text`` in a PostgreSQL dollar-quoted literal it cannot close early.

    Starts with the tag ``base_tag`` and appends an increasing number to it
    until the delimiter ``$tag$`` is safe for this particular text.

    Args:
        text: Raw string to quote.
        base_tag: Preferred tag name placed between the dollar signs.

    Returns:
        ``$tag$`` + text + ``$tag$``.
    """
    tag = base_tag
    attempt = 0
    while True:
        delimiter = f"${tag}$"
        # Safe only if the first occurrence of the delimiter in
        # "text + delimiter" is the closing one. This rejects both a
        # delimiter embedded in the text and a text that ends with
        # "$tag", which would merge with the closing delimiter.
        if (text + delimiter).find(delimiter) == len(text):
            return f"{delimiter}{text}{delimiter}"
        attempt += 1
        tag = f"{base_tag}{attempt}"


def main():
    """Discover, chunk, and embed the knowledge files, then print artifact.sql.

    Phases:
        1. Read every file from collect_files(), skip empty ones, and chunk
           the rest with chunk_markdown().
        2. Unless ``--dry-run`` is on the command line, request embeddings
           one file at a time via embed_batch(), pausing 0.5 s between files.
        3. Print one transaction to stdout: the artifact row, the document
           rows, the chunk rows, and one embedding INSERT per chunk.

    Progress and errors go to stderr. Exits with status 1 when no files are
    found, when every file is empty, or when embedding fails; exits with
    status 0 after the summary in dry-run mode.
    """
    dry_run = '--dry-run' in sys.argv

    files = collect_files()
    print(f"-- Discovered {len(files)} source files", file=sys.stderr)

    if not files:
        print("ERROR: No files found", file=sys.stderr)
        sys.exit(1)

    # Phase 1: Chunk all files
    all_docs: list[dict] = []
    total_chunks = 0

    for path, source_type in files:
        rel = str(path.relative_to(PROJECT_ROOT))
        text = path.read_text(encoding='utf-8', errors='replace').strip()
        if not text:
            print(f"  skip (empty): {rel}", file=sys.stderr)
            continue

        chunks = chunk_markdown(text)
        title = derive_title(path, text)
        checksum = hashlib.sha256(text.encode()).hexdigest()[:16]

        all_docs.append({
            'source_path': rel,
            'source_type': source_type,
            'title': title,
            'checksum': checksum,
            'chunks': chunks,
        })
        total_chunks += len(chunks)
        print(f"  {rel}: {len(chunks)} chunks", file=sys.stderr)

    print(f"-- Total: {len(all_docs)} docs, {total_chunks} chunks", file=sys.stderr)

    if dry_run:
        print(f"\nDRY RUN — would embed {total_chunks} chunks from {len(all_docs)} docs", file=sys.stderr)
        sys.exit(0)

    # With no documents the VALUES lists below would be empty, which is not
    # valid SQL; stop before spending any API calls.
    if not all_docs:
        print("ERROR: All discovered files are empty", file=sys.stderr)
        sys.exit(1)

    # Phase 2: Generate embeddings (batch per file, with rate limiting)
    print(f"-- Generating embeddings via {MODEL}...", file=sys.stderr)
    for i, doc in enumerate(all_docs):
        chunk_texts = doc['chunks']
        try:
            vectors = embed_batch(chunk_texts)
        except Exception as e:
            print(f"  ERROR embedding {doc['source_path']}: {e}", file=sys.stderr)
            sys.exit(1)
        # Chunks and vectors are paired positionally when the SQL is written;
        # a count mismatch would otherwise silently drop embeddings.
        if len(vectors) != len(chunk_texts):
            print(
                f"  ERROR embedding {doc['source_path']}: "
                f"expected {len(chunk_texts)} embeddings, got {len(vectors)}",
                file=sys.stderr,
            )
            sys.exit(1)
        doc['vectors'] = vectors
        print(f"  [{i+1}/{len(all_docs)}] {doc['source_path']}: {len(vectors)} embeddings", file=sys.stderr)
        # Rate limit: ~2 requests/sec to avoid 429s
        if i < len(all_docs) - 1:
            time.sleep(0.5)

    # Phase 3: Output artifact.sql
    try:
        git_commit = subprocess.run(
            ['git', 'rev-parse', '--short', 'HEAD'],
            capture_output=True, text=True, cwd=PROJECT_ROOT,
        ).stdout.strip() or None
    except OSError:
        # git is not installed or not executable: record no commit rather
        # than discarding the embeddings that were just generated.
        git_commit = None
    git_commit_sql = f"'{git_commit}'" if git_commit else 'NULL'

    now = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S+00')
    embedding_count = sum(len(d.get('vectors', [])) for d in all_docs)

    print("-- Built-in knowledge bootstrap artifact")
    print("-- Imported into the agent-system skills database.")
    print()
    print("BEGIN;")
    print()

    # Artifact metadata
    metadata_json = json.dumps({
        "notes": "Full project docs, internal docs, identity files, and skill definitions",
        "search_mode": "hybrid",
        "embedding_provider": "openrouter",
        "embedding_model": MODEL.lower(),
        "embedding_dimensions": DIMENSIONS,
    }, indent=2)
    # The JSON is embedded in a single-quoted SQL literal, so double any
    # single quote it may contain.
    metadata_sql = metadata_json.replace("'", "''")

    print(f"""INSERT INTO builtin_knowledge_artifacts (
    artifact_version,
    schema_version,
    source_snapshot,
    chunker_name,
    chunker_version,
    chunk_size,
    chunk_overlap,
    embedding_provider,
    embedding_model,
    embedding_dimensions,
    generated_at,
    document_count,
    chunk_count,
    embedding_count,
    git_commit,
    metadata
)
VALUES (
    '{ARTIFACT_VERSION}',
    '{SCHEMA_VERSION}',
    'full project docs + skills + identity files',
    'heading-split',
    'v2',
    {CHUNK_SIZE},
    0,
    'openrouter',
    '{MODEL.lower()}',
    {DIMENSIONS},
    TIMESTAMPTZ '{now}',
    {len(all_docs)},
    {total_chunks},
    {embedding_count},
    {git_commit_sql},
    '{metadata_sql}'::jsonb
)
ON CONFLICT (artifact_version) DO UPDATE
SET
    schema_version = EXCLUDED.schema_version,
    source_snapshot = EXCLUDED.source_snapshot,
    chunker_name = EXCLUDED.chunker_name,
    chunker_version = EXCLUDED.chunker_version,
    chunk_size = EXCLUDED.chunk_size,
    chunk_overlap = EXCLUDED.chunk_overlap,
    embedding_provider = EXCLUDED.embedding_provider,
    embedding_model = EXCLUDED.embedding_model,
    embedding_dimensions = EXCLUDED.embedding_dimensions,
    generated_at = EXCLUDED.generated_at,
    document_count = EXCLUDED.document_count,
    chunk_count = EXCLUDED.chunk_count,
    embedding_count = EXCLUDED.embedding_count,
    git_commit = EXCLUDED.git_commit,
    metadata = EXCLUDED.metadata;""")
    print()

    # Documents
    print(f"-- {len(all_docs)} documents")
    doc_values = []
    for doc in all_docs:
        sp = doc['source_path'].replace("'", "''")
        st = doc['source_type']
        title = doc['title'].replace("'", "''")
        cs = doc['checksum']
        doc_values.append(
            f"        ('{sp}', '{st}', '{title}', 'en', '{cs}', '{{}}'::jsonb)"
        )
    # Joined outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    doc_values_sql = ',\n'.join(doc_values)

    print(f"""WITH artifact AS (
    SELECT id
    FROM builtin_knowledge_artifacts
    WHERE artifact_version = '{ARTIFACT_VERSION}'
),
docs(source_path, source_type, title, locale, checksum, metadata_json) AS (
    VALUES
{doc_values_sql}
)
INSERT INTO builtin_knowledge_documents (
    artifact_id,
    source_path,
    source_type,
    title,
    locale,
    checksum,
    metadata
)
SELECT
    artifact.id,
    docs.source_path,
    docs.source_type,
    docs.title,
    docs.locale,
    docs.checksum,
    docs.metadata_json
FROM artifact
CROSS JOIN docs
ON CONFLICT (artifact_id, source_path) DO UPDATE
SET
    source_type = EXCLUDED.source_type,
    title = EXCLUDED.title,
    locale = EXCLUDED.locale,
    checksum = EXCLUDED.checksum,
    metadata = EXCLUDED.metadata;""")
    print()

    # Chunks
    print(f"-- {total_chunks} chunks")
    chunk_values = []
    for doc in all_docs:
        sp = doc['source_path'].replace("'", "''")
        for order, chunk_text in enumerate(doc['chunks']):
            content_hash = hashlib.sha256(chunk_text.encode()).hexdigest()[:32]
            # Dollar-quote the text so quotes and backslashes need no
            # escaping; the tag is chosen so the text cannot terminate it.
            quoted_text = _dollar_quote(chunk_text)
            chunk_values.append(
                f"        ('{sp}', {order}, {quoted_text}, '{content_hash}', '{{}}'::jsonb)"
            )
    chunk_values_sql = ',\n'.join(chunk_values)

    print(f"""WITH artifact AS (
    SELECT id
    FROM builtin_knowledge_artifacts
    WHERE artifact_version = '{ARTIFACT_VERSION}'
),
docs AS (
    SELECT id, source_path
    FROM builtin_knowledge_documents
    WHERE artifact_id = (SELECT id FROM artifact)
),
chunks(source_path, chunk_order, chunk_text, content_hash, metadata_json) AS (
    VALUES
{chunk_values_sql}
)
INSERT INTO builtin_knowledge_chunks (
    document_id,
    chunk_order,
    chunk_text,
    content_hash,
    metadata
)
SELECT
    docs.id,
    chunks.chunk_order,
    chunks.chunk_text,
    chunks.content_hash,
    chunks.metadata_json
FROM chunks
JOIN docs ON docs.source_path = chunks.source_path
ON CONFLICT (document_id, chunk_order) DO UPDATE
SET
    chunk_text = EXCLUDED.chunk_text,
    content_hash = EXCLUDED.content_hash,
    metadata = EXCLUDED.metadata;""")
    print()

    # Embeddings
    print(f"-- {embedding_count} embeddings")
    print()
    for doc in all_docs:
        # Counts were verified in phase 2, so zip pairs every chunk.
        for chunk_text, vector in zip(doc['chunks'], doc.get('vectors', [])):
            content_hash = hashlib.sha256(chunk_text.encode()).hexdigest()[:32]
            vec_sql = format_vector(vector)
            # NOTE(review): rows are matched by content_hash alone, so every
            # stored chunk with identical text receives this embedding —
            # confirm that is intended across artifact versions.
            print(f"""INSERT INTO builtin_knowledge_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
SELECT c.id, '{vec_sql}'::vector, 'openrouter', '{MODEL.lower()}'
FROM builtin_knowledge_chunks c
WHERE c.content_hash = '{content_hash}'
ON CONFLICT (chunk_id) DO UPDATE
SET embedding = EXCLUDED.embedding,
    embedding_provider = EXCLUDED.embedding_provider,
    embedding_model = EXCLUDED.embedding_model;""")
            print()

    print("COMMIT;")

    print(f"-- Generation complete: {len(all_docs)} docs, {total_chunks} chunks, {embedding_count} embeddings", file=sys.stderr)


# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()