clawdie-ai/scripts/memory/embed-builtin-knowledge.py
Clawdie AI 3ab5d01ba9 Regenerate built-in knowledge: 84 docs, 1853 embeddings
Rewrote embed-builtin-knowledge.py to dynamically discover all project
docs, internal docs, identity files, and skill definitions instead of
using a hardcoded 6-chunk array.  Artifact now covers full corpus via
OpenRouter BAAI/bge-m3 at 1024 dimensions.

Also fixed stale config.ts defaults: EMBED_MODEL from nomic-embed to
BAAI/bge-m3, EMBED_DIMENSIONS from 768 to 1024 (matching the pgvector
schema's vector(1024) column).

Tested: imported into live clawdie_skills DB, FTS search confirmed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---
Build: pass | Tests: pass — Tests  603 passed (603)
2026-04-06 01:37:26 +00:00

412 lines
13 KiB
Python

#!/usr/bin/env python3
"""
scripts/memory/embed-builtin-knowledge.py — Generate artifact.sql for built-in knowledge.
Discovers all project docs and skill definitions, chunks by markdown headings,
generates embeddings via OpenRouter (BAAI/bge-m3), and outputs a complete
artifact.sql ready for import into the skills database.
Usage:
OPENROUTER_API_KEY=sk-or-... python3 scripts/memory/embed-builtin-knowledge.py
OPENROUTER_API_KEY=sk-or-... python3 scripts/memory/embed-builtin-knowledge.py --dry-run
Output (stdout): Complete artifact.sql with all INSERT statements.
Progress (stderr): Discovery and embedding progress.
"""
import hashlib
import json
import os
import re
import subprocess
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
# Three directory levels above this file. Per the module docstring the script
# lives at scripts/memory/, which makes this the repository root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# ── Config ────────────────────────────────────────────────────────────────────
# Embeddings endpoint that embed_batch() sends its JSON request to.
OPENROUTER_URL = "https://openrouter.ai/api/v1/embeddings"
# Model id sent in the request body; recorded lower-cased in the generated SQL.
MODEL = "BAAI/bge-m3"
# Embedding width recorded in the artifact row and its metadata JSON.
DIMENSIONS = 1024
# Maximum number of characters per chunk produced by chunk_markdown(); also
# recorded as chunk_size in the artifact row.
CHUNK_SIZE = 900
# Upsert key of the artifact row (ON CONFLICT (artifact_version)); documents
# and chunks are attached to the artifact with this version.
ARTIFACT_VERSION = "v1.0.2-full"
# Recorded verbatim as schema_version in the artifact row.
SCHEMA_VERSION = "builtin-knowledge-base-v1"
# Files to skip — outdated, test, or meta-only content.
# Entries are compared against the bare file name (Path.name), so a name
# listed here is skipped in whichever directory it is found.
SKIP_FILES = {
'nanoclaw-architecture-final.md',
'WIZARD-SIMULATION-TEST.md',
'DOCUMENTATION-SYNC-RUNBOOK.md',
}
# Doc patterns: (glob relative to PROJECT_ROOT, source_type).
# Patterns are evaluated in order and each path is collected only once, so a
# file matched by several patterns keeps the source_type of the first match.
DOC_PATTERNS = [
('docs/public/**/*.md', 'doc'),
('docs/internal/**/*.md', 'doc'),
('SOUL.md', 'identity'),
('IDENTITY.md', 'identity'),
('USER.md', 'identity'),
('AGENTS.md', 'identity'),
('MEMORY.md', 'identity'),
('CLAWDIE-ISO.md', 'identity'),
('.agent/skills/*/SKILL.md', 'skill'),
]
# ── Chunking ──────────────────────────────────────────────────────────────────
def chunk_markdown(text: str) -> list[str]:
"""Split markdown by headings, then further split long sections. Max CHUNK_SIZE chars."""
sections = re.split(r'(?=\n#{1,3} )', text)
chunks: list[str] = []
for section in sections:
section = section.strip()
if not section:
continue
if len(section) <= CHUNK_SIZE:
chunks.append(section)
else:
paragraphs = re.split(r'\n\n+', section)
current = ''
for para in paragraphs:
para = para.strip()
if not para:
continue
if len(current) + len(para) + 2 <= CHUNK_SIZE:
current = f'{current}\n\n{para}'.strip() if current else para
else:
if current:
chunks.append(current)
while len(para) > CHUNK_SIZE:
chunks.append(para[:CHUNK_SIZE])
para = para[CHUNK_SIZE:]
current = para
if current:
chunks.append(current)
return chunks or [text[:CHUNK_SIZE]]
# ── File discovery ────────────────────────────────────────────────────────────
def collect_files() -> list[tuple[Path, str]]:
    """Collect all embeddable files together with their source_type.

    Every glob in ``DOC_PATTERNS`` is expanded relative to ``PROJECT_ROOT``.
    Matches are visited in sorted order within each pattern, and patterns are
    processed in declaration order.

    Returns:
        ``(path, source_type)`` pairs. A path matched by several patterns is
        returned once, with the source_type of the first matching pattern.
        Paths whose file name is listed in ``SKIP_FILES`` are omitted, as is
        anything that is not a regular file.
    """
    files: list[tuple[Path, str]] = []
    seen: set[Path] = set()
    for pattern, source_type in DOC_PATTERNS:
        for path in sorted(PROJECT_ROOT.glob(pattern)):
            if path.name in SKIP_FILES or path in seen:
                continue
            # A glob can also match directories or dangling symlinks named
            # like "*.md"; reading those later would abort the whole run.
            if not path.is_file():
                continue
            seen.add(path)
            files.append((path, source_type))
    return files
def derive_title(path: Path, text: str) -> str:
    """Return a display title for a document, at most 120 characters long.

    The first level-1 heading (``# ...``) found within the first ten lines of
    *text* wins. Otherwise the title is built from the file stem, with dashes
    and underscores turned into spaces and the result title-cased.
    """
    leading_lines = (raw.strip() for raw in text.splitlines()[:10])
    heading = next((ln for ln in leading_lines if ln.startswith('# ')), None)
    if heading is not None:
        return heading.lstrip('#').strip()[:120]

    # No usable heading: fall back to a prettified file name.
    spaced_stem = path.stem.translate(str.maketrans('-_', '  '))
    return spaced_stem.title()[:120]
# ── Embeddings ────────────────────────────────────────────────────────────────
def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed a batch of texts via the OpenRouter embeddings endpoint.

    Args:
        texts: Strings to embed, sent in a single request.

    Returns:
        One vector per input text, ordered to match *texts* (the response
        items are sorted by their ``index`` field). An empty input returns an
        empty list without contacting the API.

    Raises:
        RuntimeError: If the response has no ``data`` list, returns a
            different number of vectors than texts were sent, or contains a
            vector whose length is not ``DIMENSIONS``.
        urllib.error.URLError: On transport or HTTP errors (raised by urllib).

    Exits the process with status 1 when ``OPENROUTER_API_KEY`` is not set.
    """
    if not texts:
        # Nothing to embed; avoid an API round trip with an empty input.
        return []

    api_key = os.environ.get("OPENROUTER_API_KEY", "")
    if not api_key:
        print("ERROR: OPENROUTER_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    payload = json.dumps({"input": texts, "model": MODEL}).encode()
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    req = urllib.request.Request(OPENROUTER_URL, data=payload, headers=headers)
    with urllib.request.urlopen(req, timeout=120) as resp:
        data = json.load(resp)

    # An error payload has no "data" list; report it instead of a bare KeyError.
    items = data.get("data") if isinstance(data, dict) else None
    if not isinstance(items, list):
        raise RuntimeError(f"unexpected embeddings response: {str(data)[:300]}")

    vectors = [d["embedding"] for d in sorted(items, key=lambda d: d["index"])]

    # Callers pair vectors with chunks positionally, so a short response would
    # otherwise silently leave chunks without embeddings.
    if len(vectors) != len(texts):
        raise RuntimeError(
            f"expected {len(texts)} embeddings, got {len(vectors)}"
        )
    for position, vector in enumerate(vectors):
        if not isinstance(vector, list) or len(vector) != DIMENSIONS:
            raise RuntimeError(
                f"embedding {position} does not have {DIMENSIONS} dimensions"
            )
    return vectors
# ── SQL helpers ───────────────────────────────────────────────────────────────
def sql_escape(text: str) -> str:
    """Escape text for use inside a single-quoted SQL string literal.

    Every single quote is doubled, which is the standard SQL escape. The
    caller supplies the surrounding quotes.

    NOTE(review): this assumes the target server treats backslashes literally
    inside ordinary string literals (PostgreSQL's default
    ``standard_conforming_strings = on``) — confirm for the target database.
    """
    return text.replace("'", "''")
def format_vector(vec: list[float]) -> str:
    """Render a vector as a bracketed, comma-separated list of numbers.

    Each component is formatted in fixed-point notation with eight decimal
    places and no spaces, e.g. ``[0.50000000,-1.00000000]``.
    """
    rendered = (format(component, ".8f") for component in vec)
    return "[{}]".format(",".join(rendered))
# ── Main ──────────────────────────────────────────────────────────────────────
def _sql_literal(text: str) -> str:
    """Return *text* as a single-quoted SQL string literal (quotes doubled)."""
    return "'" + text.replace("'", "''") + "'"


def _dollar_quote(text: str, tag: str = "chunk") -> str:
    """Return *text* as a dollar-quoted SQL literal whose tag cannot collide with it."""
    while True:
        delim = f"${tag}$"
        # The closing delimiter must be the first occurrence of the delimiter
        # after the opening one. Searching text + delim also catches text that
        # merely ends in "$tag", which would fuse with the closing delimiter.
        if (text + delim).find(delim) == len(text):
            return f"{delim}{text}{delim}"
        tag += "_"


def _git_short_commit() -> str | None:
    """Return the short HEAD commit of PROJECT_ROOT, or None when unavailable."""
    try:
        result = subprocess.run(
            ['git', 'rev-parse', '--short', 'HEAD'],
            capture_output=True, text=True, cwd=PROJECT_ROOT,
        )
    except OSError:
        # git is not installed or not executable; the commit is optional metadata.
        return None
    return result.stdout.strip() or None


def _load_documents(files: list[tuple[Path, str]]) -> list[dict]:
    """Read and chunk every file, skipping empty ones; progress goes to stderr."""
    docs: list[dict] = []
    for path, source_type in files:
        # as_posix() keeps source_path identical regardless of the host OS.
        rel = path.relative_to(PROJECT_ROOT).as_posix()
        text = path.read_text(encoding='utf-8', errors='replace').strip()
        if not text:
            print(f" skip (empty): {rel}", file=sys.stderr)
            continue
        chunks = chunk_markdown(text)
        docs.append({
            'source_path': rel,
            'source_type': source_type,
            'title': derive_title(path, text),
            'checksum': hashlib.sha256(text.encode()).hexdigest()[:16],
            'chunks': chunks,
        })
        print(f" {rel}: {len(chunks)} chunks", file=sys.stderr)
    return docs


def _embed_documents(docs: list[dict]) -> None:
    """Attach a 'vectors' list to every document; exit with status 1 on any failure."""
    for position, doc in enumerate(docs, start=1):
        try:
            vectors = embed_batch(doc['chunks'])
        except Exception as e:
            print(f" ERROR embedding {doc['source_path']}: {e}", file=sys.stderr)
            sys.exit(1)
        # Chunks and vectors are paired positionally when the SQL is written;
        # a mismatch would silently drop or misassign embeddings.
        if len(vectors) != len(doc['chunks']):
            print(
                f" ERROR embedding {doc['source_path']}: expected "
                f"{len(doc['chunks'])} embeddings, got {len(vectors)}",
                file=sys.stderr,
            )
            sys.exit(1)
        doc['vectors'] = vectors
        print(
            f" [{position}/{len(docs)}] {doc['source_path']}: {len(vectors)} embeddings",
            file=sys.stderr,
        )
        # Rate limit: ~2 requests/sec to avoid 429s
        if position < len(docs):
            time.sleep(0.5)


def _emit_artifact_sql(all_docs: list[dict], total_chunks: int) -> int:
    """Print the complete artifact.sql to stdout and return the embedding count."""
    git_commit = _git_short_commit()
    now = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S+00')
    embedding_count = sum(len(d.get('vectors', [])) for d in all_docs)

    version_sql = _sql_literal(ARTIFACT_VERSION)
    model_sql = _sql_literal(MODEL.lower())
    commit_sql = _sql_literal(git_commit) if git_commit else 'NULL'
    metadata_sql = _sql_literal(json.dumps({
        "notes": "Full project docs, internal docs, identity files, and skill definitions",
        "search_mode": "hybrid",
        "embedding_provider": "openrouter",
        "embedding_model": MODEL.lower(),
        "embedding_dimensions": DIMENSIONS,
    }, indent=2))

    print("-- Built-in knowledge bootstrap artifact")
    print("-- Imported into the agent-system skills database.")
    print()
    print("BEGIN;")
    print()

    # Artifact metadata
    print(f"""INSERT INTO builtin_knowledge_artifacts (
    artifact_version,
    schema_version,
    source_snapshot,
    chunker_name,
    chunker_version,
    chunk_size,
    chunk_overlap,
    embedding_provider,
    embedding_model,
    embedding_dimensions,
    generated_at,
    document_count,
    chunk_count,
    embedding_count,
    git_commit,
    metadata
)
VALUES (
    {version_sql},
    {_sql_literal(SCHEMA_VERSION)},
    'full project docs + skills + identity files',
    'heading-split',
    'v2',
    {CHUNK_SIZE},
    0,
    'openrouter',
    {model_sql},
    {DIMENSIONS},
    TIMESTAMPTZ '{now}',
    {len(all_docs)},
    {total_chunks},
    {embedding_count},
    {commit_sql},
    {metadata_sql}::jsonb
)
ON CONFLICT (artifact_version) DO UPDATE
SET
    schema_version = EXCLUDED.schema_version,
    source_snapshot = EXCLUDED.source_snapshot,
    chunker_name = EXCLUDED.chunker_name,
    chunker_version = EXCLUDED.chunker_version,
    chunk_size = EXCLUDED.chunk_size,
    chunk_overlap = EXCLUDED.chunk_overlap,
    embedding_provider = EXCLUDED.embedding_provider,
    embedding_model = EXCLUDED.embedding_model,
    embedding_dimensions = EXCLUDED.embedding_dimensions,
    generated_at = EXCLUDED.generated_at,
    document_count = EXCLUDED.document_count,
    chunk_count = EXCLUDED.chunk_count,
    embedding_count = EXCLUDED.embedding_count,
    git_commit = EXCLUDED.git_commit,
    metadata = EXCLUDED.metadata;""")
    print()

    # Documents. The row list is joined outside the f-string template because
    # a backslash inside an f-string expression is a SyntaxError before 3.12.
    print(f"-- {len(all_docs)} documents")
    doc_rows = ",\n".join(
        f"    ({_sql_literal(doc['source_path'])}, {_sql_literal(doc['source_type'])}, "
        f"{_sql_literal(doc['title'])}, 'en', {_sql_literal(doc['checksum'])}, '{{}}'::jsonb)"
        for doc in all_docs
    )
    print(f"""WITH artifact AS (
    SELECT id
    FROM builtin_knowledge_artifacts
    WHERE artifact_version = {version_sql}
),
docs(source_path, source_type, title, locale, checksum, metadata_json) AS (
    VALUES
{doc_rows}
)
INSERT INTO builtin_knowledge_documents (
    artifact_id,
    source_path,
    source_type,
    title,
    locale,
    checksum,
    metadata
)
SELECT
    artifact.id,
    docs.source_path,
    docs.source_type,
    docs.title,
    docs.locale,
    docs.checksum,
    docs.metadata_json
FROM artifact
CROSS JOIN docs
ON CONFLICT (artifact_id, source_path) DO UPDATE
SET
    source_type = EXCLUDED.source_type,
    title = EXCLUDED.title,
    locale = EXCLUDED.locale,
    checksum = EXCLUDED.checksum,
    metadata = EXCLUDED.metadata;""")
    print()

    # Chunks. Text is dollar-quoted with a collision-free tag so that quotes,
    # backslashes and even a literal "$chunk$" in the docs cannot break out.
    print(f"-- {total_chunks} chunks")
    chunk_rows = ",\n".join(
        f"    ({_sql_literal(doc['source_path'])}, {order}, {_dollar_quote(chunk_text)}, "
        f"'{hashlib.sha256(chunk_text.encode()).hexdigest()[:32]}', '{{}}'::jsonb)"
        for doc in all_docs
        for order, chunk_text in enumerate(doc['chunks'])
    )
    print(f"""WITH artifact AS (
    SELECT id
    FROM builtin_knowledge_artifacts
    WHERE artifact_version = {version_sql}
),
docs AS (
    SELECT id, source_path
    FROM builtin_knowledge_documents
    WHERE artifact_id = (SELECT id FROM artifact)
),
chunks(source_path, chunk_order, chunk_text, content_hash, metadata_json) AS (
    VALUES
{chunk_rows}
)
INSERT INTO builtin_knowledge_chunks (
    document_id,
    chunk_order,
    chunk_text,
    content_hash,
    metadata
)
SELECT
    docs.id,
    chunks.chunk_order,
    chunks.chunk_text,
    chunks.content_hash,
    chunks.metadata_json
FROM chunks
JOIN docs ON docs.source_path = chunks.source_path
ON CONFLICT (document_id, chunk_order) DO UPDATE
SET
    chunk_text = EXCLUDED.chunk_text,
    content_hash = EXCLUDED.content_hash,
    metadata = EXCLUDED.metadata;""")
    print()

    # Embeddings. Each statement targets exactly one chunk of this artifact via
    # (artifact_version, source_path, chunk_order). Matching on content_hash
    # alone would also overwrite embeddings of identical chunks that belong to
    # other artifacts or documents.
    print(f"-- {embedding_count} embeddings")
    print()
    for doc in all_docs:
        path_sql = _sql_literal(doc['source_path'])
        for order, vector in enumerate(doc.get('vectors', [])):
            print(f"""INSERT INTO builtin_knowledge_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
SELECT c.id, '{format_vector(vector)}'::vector, 'openrouter', {model_sql}
FROM builtin_knowledge_chunks c
JOIN builtin_knowledge_documents d ON d.id = c.document_id
JOIN builtin_knowledge_artifacts a ON a.id = d.artifact_id
WHERE a.artifact_version = {version_sql}
  AND d.source_path = {path_sql}
  AND c.chunk_order = {order}
ON CONFLICT (chunk_id) DO UPDATE
SET embedding = EXCLUDED.embedding,
    embedding_provider = EXCLUDED.embedding_provider,
    embedding_model = EXCLUDED.embedding_model;""")
            print()
    print("COMMIT;")
    return embedding_count


def main():
    """Generate artifact.sql on stdout; report progress on stderr.

    Phases: discover files, chunk them, embed every chunk (one request per
    file), then print a single transaction that upserts the artifact row, its
    documents, chunks and embeddings. With ``--dry-run`` in ``sys.argv`` the
    run stops after chunking and prints only the totals.

    Exits with status 1 when no files are found, when every file is empty, or
    when embedding fails; exits with status 0 after a dry run.
    """
    dry_run = '--dry-run' in sys.argv

    files = collect_files()
    print(f"-- Discovered {len(files)} source files", file=sys.stderr)
    if not files:
        print("ERROR: No files found", file=sys.stderr)
        sys.exit(1)

    # Phase 1: Chunk all files
    all_docs = _load_documents(files)
    total_chunks = sum(len(doc['chunks']) for doc in all_docs)
    print(f"-- Total: {len(all_docs)} docs, {total_chunks} chunks", file=sys.stderr)

    if dry_run:
        print(f"\nDRY RUN — would embed {total_chunks} chunks from {len(all_docs)} docs", file=sys.stderr)
        sys.exit(0)

    if not all_docs:
        # An empty VALUES list would make the generated SQL invalid.
        print("ERROR: All discovered files are empty", file=sys.stderr)
        sys.exit(1)

    # Phase 2: Generate embeddings (batch per file, with rate limiting)
    print(f"-- Generating embeddings via {MODEL}...", file=sys.stderr)
    _embed_documents(all_docs)

    # Phase 3: Output artifact.sql
    embedding_count = _emit_artifact_sql(all_docs, total_chunks)
    print(f"-- Generation complete: {len(all_docs)} docs, {total_chunks} chunks, {embedding_count} embeddings", file=sys.stderr)
# Script entry point: run the pipeline only when executed directly, not when
# the module is imported. main() returns None, so the exit status stays 0
# unless main() itself exits earlier.
if __name__ == "__main__":
    sys.exit(main())