- embed-docs.py: new script to batch-embed project docs into klavdija_brain (37 docs → 852 chunks with session_id pattern doc:<path>) - import-memories.py: updated embed() to send Authorization header and dimensions=1024 parameter for OpenAI/configurable provider support - memories-transfer-v2.json: recovered historical session memories (local copy) - package.json: add @supabase/supabase-js dependency - src/channels/telegram.ts, src/config.ts: Telegram channel integration Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- Build: pass | Tests: pass — Tests 431 passed (431)
285 lines
11 KiB
Python
Executable file
285 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
scripts/memory/embed-docs.py — Embed project docs into klavdija_brain.
|
|
|
|
Reads markdown files, splits by heading sections, embeds via bge-m3,
|
|
and inserts into the memories/memory_chunks/memory_embeddings tables.
|
|
|
|
Usage:
|
|
python3 scripts/memory/embed-docs.py # embed all docs
|
|
python3 scripts/memory/embed-docs.py --dry-run # show what would be embedded
|
|
python3 scripts/memory/embed-docs.py --force # re-embed even if already present
|
|
|
|
session_id pattern: doc:<relative-path> e.g. doc:docs/HEARTBEAT.md
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# ── Config ─────────────────────────────────────────────────────────────────────
|
|
|
|
# Three directory levels above this file; with the script living at
# scripts/memory/embed-docs.py that is the repository root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
|
|
|
def load_env(path: Path) -> dict[str, str]:
    """Parse a dotenv-style file into a ``{key: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Each remaining line is split on the first ``=``; key and value are
    whitespace-stripped.  A leading ``export `` on the key is dropped, and a
    value wrapped in one *matching* pair of single or double quotes has that
    pair removed.  Quote characters that are not a matching outer pair are
    kept, so a secret such as ``p@ss'`` survives intact.

    Args:
        path: Location of the env file.

    Returns:
        The parsed mapping; empty when ``path`` is not an existing file.
    """
    env: dict[str, str] = {}
    if not path.is_file():
        return env

    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    for raw_line in path.read_text(encoding='utf-8').splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#') or '=' not in line:
            continue

        key, _, value = line.partition('=')
        key = key.strip()
        if key.startswith('export '):
            key = key[len('export '):].strip()

        value = value.strip()
        # Only strip quotes when they form a matching outer pair.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
            value = value[1:-1]

        env[key] = value
    return env
|
|
|
|
# All settings are read from the project-root .env file via load_env(); each
# ENV.get() falls back to the literal default shown when the key is absent.
ENV = load_env(PROJECT_ROOT / '.env')
AGENT_NAME = ENV.get('AGENT_NAME', 'klavdija')
DB_HOST = ENV.get('WARDEN_DB_IP', '192.168.100.3')
DB_PORT = ENV.get('DB_PORT', '5432')  # kept as a string: handed to `psql -p` as-is
# Database name and database role are both derived from the agent name.
DB_NAME = f'{AGENT_NAME}_brain'
DB_USER = f'{AGENT_NAME}_brain'
DB_PASSWORD = ENV.get('MEMORY_DB_PASSWORD', '')
EMBED_URL = ENV.get('EMBED_BASE_URL', 'http://192.168.100.5:8080/v1')
EMBED_MODEL = ENV.get('EMBED_MODEL', 'bge-m3')
EMBED_DIMS = int(ENV.get('EMBED_DIMENSIONS', '1024'))  # raises ValueError if not an integer
# Prefer a dedicated embedding key; fall back to OPENAI_API_KEY when it is
# missing or empty.  An empty result means no Authorization header is sent.
EMBED_KEY = ENV.get('EMBED_API_KEY', '') or ENV.get('OPENAI_API_KEY', '')
|
|
|
|
# Files to skip — outdated, test, or meta-only content.
# Entries are compared against the bare file name (path.name) in
# collect_files(), so a name listed here is skipped in any matched directory.
SKIP_FILES = {
    'nanoclaw-architecture-final.md',  # old NanoClaw predecessor, pre-FreeBSD
    'WIZARD-SIMULATION-TEST.md',  # test fixture, not knowledge
    'DOCUMENTATION-SYNC-RUNBOOK.md',  # meta-docs about managing docs
}
|
|
|
|
# Docs to embed: docs/*.md + root identity files.
# Each entry is (glob relative to PROJECT_ROOT, importance).  collect_files()
# keeps only the first occurrence of a path, so when two patterns match the
# same file the earlier entry's importance is the one used.
DOC_PATTERNS = [
    ('docs/*.md', 3),  # (glob relative to project root, importance)
    ('SOUL.md', 5),
    ('IDENTITY.md', 5),
    ('USER.md', 5),
    ('AGENTS.md', 4),
    ('MEMORY.md', 4),
    ('CLAWDIE-ISO.md', 3),
]
|
|
|
|
# Upper bound, in characters, on the length of one chunk from chunk_markdown().
MAX_CHUNK = 900  # chars — larger than session memories for doc retrieval
|
|
|
|
# ── Chunking ───────────────────────────────────────────────────────────────────
|
|
|
|
def chunk_markdown(text: str, max_chunk: "int | None" = None) -> list[str]:
    """Split markdown into chunks of at most ``max_chunk`` characters.

    The text is first cut before every line that starts with one to three
    ``#`` characters followed by a space (``#``, ``##`` or ``###`` headings).
    A section that fits within the limit becomes one chunk.  A longer section
    is split on blank-line paragraph boundaries and the paragraphs are packed
    greedily, joined by a blank line.  A single paragraph that is still too
    long is cut at fixed ``max_chunk``-character offsets.

    Args:
        text: Markdown source.
        max_chunk: Maximum chunk length in characters.  ``None`` (the
            default) uses the module-level ``MAX_CHUNK``.

    Returns:
        The chunks in document order.  If splitting produces nothing (for
        example whitespace-only input), a single chunk holding the first
        ``max_chunk`` characters of ``text`` is returned.

    Raises:
        ValueError: If the effective limit is not positive; the hard-split
            loop could never terminate with such a limit.
    """
    limit = MAX_CHUNK if max_chunk is None else max_chunk
    if limit <= 0:
        raise ValueError(f'max_chunk must be positive, got {limit}')

    chunks: list[str] = []

    # The lookahead keeps each heading attached to the section it introduces.
    for section in re.split(r'(?=\n#{1,3} )', text):
        section = section.strip()
        if not section:
            continue

        if len(section) <= limit:
            chunks.append(section)
            continue

        # Section too long: pack whole paragraphs into chunks.
        current = ''
        for para in re.split(r'\n\n+', section):
            para = para.strip()
            if not para:
                continue

            # +2 accounts for the blank-line separator between paragraphs.
            if len(current) + len(para) + 2 <= limit:
                current = f'{current}\n\n{para}' if current else para
                continue

            if current:
                chunks.append(current)
            # A paragraph that is too long on its own is cut at fixed offsets.
            while len(para) > limit:
                chunks.append(para[:limit])
                para = para[limit:]
            current = para

        if current:
            chunks.append(current)

    return chunks or [text[:limit]]
|
|
|
|
# ── Embeddings ─────────────────────────────────────────────────────────────────
|
|
|
|
import urllib.request
|
|
|
|
def embed(texts: list[str]) -> list[list[float]]:
    """Embed a batch of texts with a single POST to ``{EMBED_URL}/embeddings``.

    The request body carries ``input`` and ``model``; ``dimensions`` is added
    whenever ``EMBED_DIMS`` differs from 1536.  An ``Authorization: Bearer``
    header is sent only when ``EMBED_KEY`` is non-empty.

    Args:
        texts: Strings to embed.

    Returns:
        One vector per input, ordered by the ``index`` field of the response
        items.  An empty input returns ``[]`` without any network call.

    Raises:
        RuntimeError: If the response does not contain exactly one embedding
            per input text.
        urllib.error.URLError: On connection or HTTP failure (raised by
            ``urlopen``).
    """
    if not texts:
        return []

    body: dict = {'input': texts, 'model': EMBED_MODEL}
    # NOTE(review): 1536 is treated as the provider default that needs no
    # explicit `dimensions` field — confirm against the provider in use.
    if EMBED_DIMS != 1536:
        body['dimensions'] = EMBED_DIMS
    payload = json.dumps(body).encode()

    headers = {'Content-Type': 'application/json'}
    if EMBED_KEY:
        headers['Authorization'] = f'Bearer {EMBED_KEY}'

    # Tolerate a trailing slash on the configured base URL.
    url = f"{EMBED_URL.rstrip('/')}/embeddings"
    req = urllib.request.Request(url, data=payload, headers=headers)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.load(resp)

    items = sorted(data['data'], key=lambda d: d['index'])
    # Callers pair vectors with chunks positionally; a short response would
    # otherwise drop chunks without any error.
    if len(items) != len(texts):
        raise RuntimeError(
            f'embedding API returned {len(items)} vectors for {len(texts)} inputs'
        )
    return [item['embedding'] for item in items]
|
|
|
|
# ── PostgreSQL via psql ────────────────────────────────────────────────────────
|
|
|
|
def psql(sql: str) -> str:
    """Run ``sql`` through the ``psql`` command-line client.

    The connection uses DB_HOST, DB_PORT, DB_USER and DB_NAME; the password
    is passed to the child process through the PGPASSWORD environment
    variable.  Output is requested unaligned, tuples-only and quiet.

    Returns:
        The client's stdout with surrounding whitespace removed.

    Raises:
        RuntimeError: If psql exits with a non-zero status; the message
            carries its stderr.
    """
    command = [
        'psql',
        '-h', DB_HOST,
        '-p', DB_PORT,
        '-U', DB_USER,
        '-d', DB_NAME,
        '--no-align', '--tuples-only', '--quiet',
        '-c', sql,
    ]
    child_env = {**os.environ, 'PGPASSWORD': DB_PASSWORD}
    proc = subprocess.run(command, capture_output=True, text=True, env=child_env)
    if proc.returncode == 0:
        return proc.stdout.strip()
    raise RuntimeError(f"psql: {proc.stderr.strip()}")
|
|
|
|
def session_exists(session_id: str) -> bool:
    """Return True when the ``memories`` table has a row with this session_id."""
    # Double any single quote so the id is safe inside the SQL string literal.
    escaped = session_id.replace("'", "''")
    query = f"SELECT 1 FROM memories WHERE session_id = '{escaped}' LIMIT 1"
    return psql(query).strip() != ''
|
|
|
|
def insert_doc(session_id: str, summary: str, topics: list[str],
               importance: int, chunks: list[str], vectors: list[list[float]]) -> None:
    """Insert one document: a ``memories`` row plus its chunks and embeddings.

    One row goes into ``memories``.  Then, for every (chunk, vector) pair, one
    row goes into ``memory_chunks`` (with its order and the SHA-256 of the
    chunk text) and one into ``memory_embeddings``.  Each statement is a
    separate ``psql`` call, so the insert is not atomic.

    Args:
        session_id: Value stored in ``memories.session_id``.
        summary: Value stored in ``memories.summary``.
        topics: Strings stored as a PostgreSQL array in ``memories.topics``.
        importance: Value stored in ``memories.importance``.
        chunks: Chunk texts, in document order.
        vectors: One embedding per chunk, in the same order.

    Raises:
        ValueError: If ``chunks`` and ``vectors`` differ in length (checked
            before anything is written).
        RuntimeError: If any psql invocation fails.
    """
    if len(chunks) != len(vectors):
        raise ValueError(
            f'{len(chunks)} chunks but {len(vectors)} vectors for {session_id}'
        )

    safe_summary = summary.replace("'", "''")
    safe_session = session_id.replace("'", "''")
    safe_model = EMBED_MODEL.replace("'", "''")

    # Two escaping layers for the topics: inside a PostgreSQL array literal
    # each quoted element needs \ and " backslash-escaped; the finished
    # literal is then embedded in a SQL string, so ' must be doubled.
    elements = ','.join(
        '"' + t.replace('\\', '\\\\').replace('"', '\\"') + '"' for t in topics
    )
    topics_arr = ('{' + elements + '}').replace("'", "''")

    mem_id = psql(f"""
        INSERT INTO memories (session_id, importance, summary, topics)
        VALUES ('{safe_session}', {importance}, '{safe_summary}', '{topics_arr}')
        RETURNING id
    """)

    for order, (chunk, vector) in enumerate(zip(chunks, vectors)):
        safe_chunk = chunk.replace("'", "''")
        content_hash = hashlib.sha256(chunk.encode()).hexdigest()
        chunk_id = psql(f"""
            INSERT INTO memory_chunks (memory_id, chunk_order, chunk_text, content_hash)
            VALUES ('{mem_id}', {order}, '{safe_chunk}', '{content_hash}')
            RETURNING id
        """)
        vector_str = '[' + ','.join(str(v) for v in vector) + ']'
        # NOTE(review): the provider is hard-coded to 'llama-cpp' even though
        # the embedding endpoint is configurable — confirm this is intended.
        psql(f"""
            INSERT INTO memory_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
            VALUES ('{chunk_id}', '{vector_str}'::vector, 'llama-cpp', '{safe_model}')
        """)
|
|
|
|
def delete_doc(session_id: str) -> None:
    """Delete every row stored for ``session_id``, atomically.

    Rows are removed child-first: ``memory_embeddings``, then
    ``memory_chunks``, then ``memories``.  All three DELETEs are sent in one
    psql invocation inside an explicit transaction, so a failure partway
    through leaves the document untouched instead of half-deleted.

    Raises:
        RuntimeError: If the psql invocation fails.
    """
    safe = session_id.replace("'", "''")
    psql(f"""
        BEGIN;
        DELETE FROM memory_embeddings
        WHERE chunk_id IN (
            SELECT mc.id FROM memory_chunks mc
            JOIN memories m ON mc.memory_id = m.id
            WHERE m.session_id = '{safe}'
        );
        DELETE FROM memory_chunks
        WHERE memory_id IN (
            SELECT id FROM memories WHERE session_id = '{safe}'
        );
        DELETE FROM memories WHERE session_id = '{safe}';
        COMMIT;
    """)
|
|
|
|
# ── Topic extraction ───────────────────────────────────────────────────────────
|
|
|
|
def extract_topics(path: Path, text: str) -> list[str]:
    """Derive up to five topic strings for a document.

    The first topic is the file stem, lower-cased, with ``-`` and ``_``
    replaced by spaces.  Further topics come from first-level headings (lines
    starting with ``# ``), lower-cased and truncated to 30 characters, with
    duplicates skipped.  Lines inside fenced code blocks (delimited by three
    or more backticks or tildes) are ignored, so shell-style ``# comment``
    lines in code samples are not taken for headings.

    Args:
        path: Path of the document; only its stem is used.
        text: Markdown content of the document.

    Returns:
        At most five topics, file-stem topic first.
    """
    topics = [path.stem.lower().replace('-', ' ').replace('_', ' ')]

    fence = ''  # marker that opened the current code fence; '' when outside
    for line in text.splitlines():
        stripped = line.strip()
        marker = re.match(r'`{3,}|~{3,}', stripped)
        if marker:
            if not fence:
                fence = marker.group(0)
            elif set(stripped) == {fence[0]} and len(stripped) >= len(fence):
                # A closing fence is a bare run of the opening character, at
                # least as long as the opener.
                fence = ''
            continue

        if fence or not line.startswith('# '):
            continue

        topic = line[2:].strip().lower()[:30]
        if topic and topic not in topics:
            topics.append(topic)

    return topics[:5]
|
|
|
|
# ── Main ───────────────────────────────────────────────────────────────────────
|
|
|
|
def collect_files() -> list[tuple[Path, int]]:
    """Resolve DOC_PATTERNS to a de-duplicated list of ``(path, importance)``.

    Patterns are expanded in order relative to PROJECT_ROOT, with each
    pattern's matches sorted.  Files whose name is in SKIP_FILES are dropped.
    A path matched by more than one pattern keeps the importance of the first
    pattern that matched it.
    """
    # A dict keeps first-insertion order, so it serves as both the "seen" set
    # and the ordered result.
    selected: dict[Path, int] = {}
    for pattern, importance in DOC_PATTERNS:
        for candidate in sorted(PROJECT_ROOT.glob(pattern)):
            if candidate.name in SKIP_FILES or candidate in selected:
                continue
            selected[candidate] = importance
    return list(selected.items())
|
|
|
|
def main() -> None:
    """Embed every collected document and store it in the database.

    Command-line flags (read from ``sys.argv``):
        --dry-run  Only report what would be embedded; no database or
                   embedding calls are made.
        --force    Re-embed documents that already exist, deleting the old
                   rows first.

    For each file the session id is ``doc:<path relative to PROJECT_ROOT>``.
    Empty files are skipped, and so are already-stored documents unless
    ``--force`` is given.  A failure while embedding, deleting or inserting
    one document is reported and the run moves on to the next file.

    Raises:
        SystemExit: With status 1 after the summary if any document failed,
            so callers and CI can detect a partial run.
    """
    dry_run = '--dry-run' in sys.argv
    force = '--force' in sys.argv

    files = collect_files()

    print(f"Embedding {len(files)} documents into {DB_NAME}@{DB_HOST}")
    print(f"Embed: {EMBED_URL} ({EMBED_MODEL}) chunk_size={MAX_CHUNK}")
    if dry_run:
        print("DRY RUN — no writes\n")
    print()

    total_chunks = 0
    skipped = 0
    embedded = 0
    failed = 0

    for path, importance in files:
        rel = str(path.relative_to(PROJECT_ROOT))
        session_id = f'doc:{rel}'
        text = path.read_text(encoding='utf-8', errors='replace').strip()

        if not text:
            print(f" skip (empty): {rel}")
            skipped += 1
            continue

        if not dry_run and not force and session_exists(session_id):
            print(f" skip (exists): {rel}")
            skipped += 1
            continue

        chunks = chunk_markdown(text)
        # text is non-empty after strip(), so a first line always exists.
        summary = f"{path.name}: {text.splitlines()[0].lstrip('#').strip()[:80]}"
        topics = extract_topics(path, text)

        if dry_run:
            print(f" would embed: {rel} ({len(chunks)} chunks, importance={importance})")
            total_chunks += len(chunks)
            continue

        # Embed in one batch per file.  Embedding comes before the delete so
        # that an embedding failure never removes the existing rows.
        try:
            vectors = embed(chunks)
        except Exception as e:
            print(f" ERROR embedding {rel}: {e}")
            failed += 1
            continue

        # A DB error while replacing one document must not abort the run.
        try:
            if force and session_exists(session_id):
                delete_doc(session_id)
        except Exception as e:
            print(f" ERROR deleting {rel}: {e}")
            failed += 1
            continue

        try:
            insert_doc(session_id, summary, topics, importance, chunks, vectors)
        except Exception as e:
            print(f" ERROR inserting {rel}: {e}")
            failed += 1
            continue

        print(f" ✓ {rel} ({len(chunks)} chunks)")
        total_chunks += len(chunks)
        embedded += 1

    print()
    if dry_run:
        print(f"Would embed {len(files) - skipped} docs → {total_chunks} chunks")
    else:
        print(f"Done. {embedded} docs embedded, {skipped} skipped, {total_chunks} total chunks.")

    if failed:
        print(f"{failed} docs failed.")
        sys.exit(1)
|
|
|
|
# Run the embedding pass only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|