chore: commit backfill-embeddings maintenance script

Repairs memory_chunks rows missing vector embeddings — useful after embedding API outages, provider switches, or fresh installs. Has dry-run mode and rate-limit backoff. Not a skill; run manually when semantic memory search degrades. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 09:01:23 +02:00 · 2026-04-21 09:01:23 +02:00 · e97a1dec2c
commit e97a1dec2c
parent a5bd81fac1
1 changed files with 135 additions and 0 deletions
--- a/scripts/backfill-embeddings.ts
+++ b/scripts/backfill-embeddings.ts
@ -0,0 +1,135 @@
+/**
+ * Backfill missing vector embeddings for memory chunks.
+ *
+ * Useful after embedding API outages, provider switches, or fresh installs
+ * where chunks were stored but embeddings failed. Repairs semantic search
+ * without re-importing memories.
+ *
+ * Requires env vars from .env — source them before running:
+ *   env $(grep -v '^#' .env | xargs) npx tsx scripts/backfill-embeddings.ts
+ *
+ * Options:
+ *   --dry-run   Count missing embeddings without writing anything
+ *   BATCH_SIZE=N  Environment variable: process N chunks before logging progress (default 5)
+ */
+import pg from 'pg';
+
+// Postgres connection string. Falls back to '' when unset so the type is an
+// honest `string`; main() treats the empty value as "not configured".
+const DB_URL = process.env.MEMORY_DB_URL ?? '';
+
+// Full embeddings endpoint, or '' when EMBED_BASE_URL is unset. Trailing
+// slashes on the base URL are stripped so the result never contains
+// "//embeddings".
+const EMBED_API_URL = process.env.EMBED_BASE_URL
+  ? `${process.env.EMBED_BASE_URL.replace(/\/+$/, '')}/embeddings`
+  : '';
+
+// `||` (not `??`) on purpose: an empty EMBED_MODEL also selects the default.
+const EMBED_MODEL = process.env.EMBED_MODEL || 'BAAI/bge-m3';
+
+// Optional bearer token; an empty string means "send no Authorization header".
+const EMBED_API_KEY = process.env.EMBED_API_KEY ?? '';
+
+// BATCH_SIZE from the environment; unset, non-numeric, or non-positive values
+// fall back to 5 rather than leaking NaN/0 into the rest of the script.
+const parsedBatchSize = Number.parseInt(process.env.BATCH_SIZE ?? '', 10);
+const BATCH_SIZE =
+  Number.isInteger(parsedBatchSize) && parsedBatchSize > 0 ? parsedBatchSize : 5;
+
+// True when the script was invoked with the --dry-run flag.
+const DRY_RUN = process.argv.includes('--dry-run');
+
+const DELAY_MS = 500; // rate limit cushion
+
+/**
+ * Request an embedding vector for `text` from the configured embeddings endpoint.
+ *
+ * POSTs `{ model: EMBED_MODEL, input: text }` as JSON to EMBED_API_URL. A Bearer
+ * Authorization header is added only when EMBED_API_KEY is non-empty.
+ *
+ * @param text - Text to embed.
+ * @returns The `embedding` array of the first entry in the response's `data` list.
+ * @throws Error `Embedding API error <status>: <body>` for a non-2xx response,
+ *   `Embedding API error: <json>` when the payload carries an `error` field,
+ *   `No embedding returned` when `data` is missing or empty, and
+ *   `Embedding API returned a malformed embedding` when the first entry does not
+ *   hold a non-empty array of numbers.
+ */
+async function generateEmbedding(text: string): Promise<number[]> {
+  const headers: Record<string, string> = {
+    'Content-Type': 'application/json',
+  };
+  if (EMBED_API_KEY) headers['Authorization'] = `Bearer ${EMBED_API_KEY}`;
+
+  const res = await fetch(EMBED_API_URL, {
+    method: 'POST',
+    headers,
+    body: JSON.stringify({ model: EMBED_MODEL, input: text }),
+  });
+
+  if (!res.ok) {
+    const body = await res.text();
+    // The status code is kept at this position in the message so callers can
+    // recognise rate limiting (429) from the message text.
+    throw new Error(`Embedding API error ${res.status}: ${body}`);
+  }
+
+  // The payload is external input: type it loosely and validate before use.
+  const payload = (await res.json()) as {
+    data?: Array<{ embedding?: unknown } | null>;
+    error?: unknown;
+  } | null;
+  if (payload?.error) {
+    throw new Error(`Embedding API error: ${JSON.stringify(payload.error)}`);
+  }
+
+  const entries = payload?.data;
+  if (!Array.isArray(entries) || entries.length === 0) {
+    throw new Error('No embedding returned');
+  }
+
+  // Reject a missing or non-numeric vector here instead of handing `undefined`
+  // (typed as number[]) to the caller.
+  const embedding: unknown = entries[0]?.embedding;
+  if (
+    !Array.isArray(embedding) ||
+    embedding.length === 0 ||
+    embedding.some((value) => typeof value !== 'number')
+  ) {
+    throw new Error('Embedding API returned a malformed embedding');
+  }
+  return embedding as number[];
+}
+
+/**
+ * Pause for roughly `ms` milliseconds.
+ *
+ * @param ms - Delay passed to `setTimeout`.
+ * @returns A promise that resolves when the timer fires.
+ */
+function sleep(ms: number) {
+  return new Promise((resolve) => {
+    setTimeout(resolve, ms);
+  });
+}
+
+/**
+ * Backfill embeddings for every memory_chunks row that has no memory_embeddings row.
+ *
+ * Flow:
+ *  1. Exit with code 1 if MEMORY_DB_URL or EMBED_BASE_URL is not configured.
+ *  2. Select all chunks lacking an embedding, ordered by id, and report the count.
+ *  3. With --dry-run, stop there without writing anything.
+ *  4. Otherwise embed and insert each chunk in turn, pausing DELAY_MS between
+ *     chunks. A chunk that fails with an embedding-API rate limit (429) is
+ *     retried once after 10s; any other failure is logged and counted.
+ *
+ * Progress is logged every BATCH_SIZE chunks whether or not the chunk at the
+ * boundary succeeded. The pool is closed on every path that created it.
+ */
+async function main() {
+  if (!DB_URL) {
+    console.error('MEMORY_DB_URL not set');
+    process.exit(1);
+  }
+  if (!EMBED_API_URL) {
+    console.error('EMBED_BASE_URL not set');
+    process.exit(1);
+  }
+
+  type MissingChunk = { id: string; chunk_text: string };
+
+  const pool = new pg.Pool({ connectionString: DB_URL, max: 3 });
+
+  try {
+    // Find chunks without embeddings
+    const { rows: missing } = await pool.query<MissingChunk>(
+      `SELECT mc.id, mc.chunk_text
+       FROM memory_chunks mc
+       LEFT JOIN memory_embeddings me ON me.chunk_id = mc.id
+       WHERE me.chunk_id IS NULL
+       ORDER BY mc.id`,
+    );
+
+    console.log(`Found ${missing.length} chunks without embeddings`);
+    if (DRY_RUN) {
+      console.log('Dry run — exiting.');
+      return;
+    }
+
+    // Embed one chunk and insert its row; shared by the first attempt and the retry.
+    const embedAndStore = async (chunk: MissingChunk): Promise<void> => {
+      const embedding = await generateEmbedding(chunk.chunk_text);
+      await pool.query(
+        `INSERT INTO memory_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
+         VALUES ($1, $2::vector, $3, $4)`,
+        [chunk.id, JSON.stringify(embedding), process.env.EMBED_BASE_URL, EMBED_MODEL],
+      );
+    };
+
+    // Turn any thrown value into printable text.
+    const errorMessage = (err: unknown): string =>
+      err instanceof Error ? err.message : String(err);
+
+    // Only embedding-API errors count as rate limiting; a database error that
+    // merely contains the digits "429" must not trigger the 10s back-off.
+    const isRateLimited = (message: string): boolean =>
+      message.startsWith('Embedding API error') && /\b429\b/.test(message);
+
+    // Guard against a NaN/zero/negative BATCH_SIZE, which would otherwise
+    // silently disable progress output.
+    const logEvery = Number.isInteger(BATCH_SIZE) && BATCH_SIZE > 0 ? BATCH_SIZE : 5;
+
+    let ok = 0;
+    let fail = 0;
+
+    for (let i = 0; i < missing.length; i++) {
+      const chunk = missing[i];
+      try {
+        await embedAndStore(chunk);
+        ok++;
+      } catch (err: unknown) {
+        const message = errorMessage(err);
+        console.error(`  FAIL chunk ${chunk.id}: ${message}`);
+
+        let recovered = false;
+        // If rate limited, back off and retry once
+        if (isRateLimited(message)) {
+          console.log('  Rate limited — waiting 10s...');
+          await sleep(10_000);
+          try {
+            await embedAndStore(chunk);
+            recovered = true;
+          } catch (retryErr: unknown) {
+            console.error(`  RETRY FAIL chunk ${chunk.id}: ${errorMessage(retryErr)}`);
+          }
+        }
+
+        if (recovered) {
+          ok++;
+        } else {
+          fail++;
+        }
+      }
+
+      // Logged outside the try so a failure on the boundary chunk does not
+      // skip the progress line.
+      if ((i + 1) % logEvery === 0) {
+        console.log(`  [${i + 1}/${missing.length}] ok=${ok} fail=${fail}`);
+      }
+
+      // No need to wait after the final chunk.
+      if (i < missing.length - 1) {
+        await sleep(DELAY_MS);
+      }
+    }
+
+    console.log(`\nDone. ok=${ok} fail=${fail} total=${missing.length}`);
+  } finally {
+    // Release connections on every path, including when a query throws.
+    await pool.end();
+  }
+}
+
+// Script entry point: run main() and turn any rejection that escapes it into
+// a logged error and a non-zero exit status.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});