/**
 * Backfill missing vector embeddings for memory chunks.
 *
 * Finds every `memory_chunks` row that has no matching `memory_embeddings`
 * row, requests an embedding for its text and inserts it. Useful after
 * embedding API outages, provider switches, or fresh installs where chunks
 * were stored but embeddings failed. Repairs semantic search without
 * re-importing memories. Not a skill; run manually when semantic memory
 * search degrades.
 *
 * Requires env vars from .env — source them before running:
 *   env $(grep -v '^#' .env | xargs) npx tsx scripts/backfill-embeddings.ts
 *
 * Environment:
 *   MEMORY_DB_URL   Postgres connection string (required)
 *   EMBED_BASE_URL  Base URL of the embeddings API; `/embeddings` is appended (required)
 *   EMBED_MODEL     Embedding model name (default BAAI/bge-m3)
 *   EMBED_API_KEY   Sent as a Bearer token when set (optional)
 *
 * Options:
 *   --dry-run       Count missing embeddings without writing anything
 *   BATCH_SIZE=N    Process N chunks before logging progress (default 5)
 */
import pg from 'pg';

const DB_URL = process.env.MEMORY_DB_URL;
const EMBED_BASE_URL = process.env.EMBED_BASE_URL;
// Trim trailing slashes so a base URL such as "http://host/v1/" does not yield "//embeddings".
const EMBED_API_URL = EMBED_BASE_URL ? `${EMBED_BASE_URL.replace(/\/+$/, '')}/embeddings` : '';
const EMBED_MODEL = process.env.EMBED_MODEL || 'BAAI/bge-m3';
const EMBED_API_KEY = process.env.EMBED_API_KEY ?? '';

const DEFAULT_BATCH_SIZE = 5;
const requestedBatchSize = Number.parseInt(process.env.BATCH_SIZE ?? '', 10);
// A missing, non-numeric or non-positive BATCH_SIZE falls back to the default
// (a zero or NaN interval would silently disable progress logging).
const BATCH_SIZE =
  Number.isInteger(requestedBatchSize) && requestedBatchSize > 0
    ? requestedBatchSize
    : DEFAULT_BATCH_SIZE;
const DRY_RUN = process.argv.includes('--dry-run');
const DELAY_MS = 500; // rate limit cushion between consecutive chunks
const RATE_LIMIT_BACKOFF_MS = 10_000; // pause before the single retry after an HTTP 429

/** A chunk row that has no embedding yet. */
interface MissingChunk {
  id: string;
  chunk_text: string;
}

/** Error raised for any failed or malformed embeddings API response. */
class EmbeddingApiError extends Error {
  /** HTTP status of the failed response, when the failure was a non-2xx reply. */
  readonly status: number | undefined;

  constructor(message: string, status?: number) {
    super(message);
    this.name = 'EmbeddingApiError';
    this.status = status;
  }
}

/** Best-effort human-readable message for an arbitrary thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** True only when the embeddings API itself answered with HTTP 429. */
function isRateLimited(err: unknown): boolean {
  return err instanceof EmbeddingApiError && err.status === 429;
}

/**
 * Requests an embedding for `text` from the configured embeddings endpoint.
 *
 * Sends `{ model, input }` as JSON and reads the vector from `data[0].embedding`
 * of the response.
 *
 * @param text - The chunk text to embed.
 * @returns The embedding vector.
 * @throws EmbeddingApiError on a non-2xx response (with `status` set), an
 *   `error` field in the response body, or a missing/empty embedding.
 */
async function generateEmbedding(text: string): Promise<number[]> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (EMBED_API_KEY) headers['Authorization'] = `Bearer ${EMBED_API_KEY}`;

  const res = await fetch(EMBED_API_URL, {
    method: 'POST',
    headers,
    body: JSON.stringify({ model: EMBED_MODEL, input: text }),
  });

  if (!res.ok) {
    const body = await res.text();
    throw new EmbeddingApiError(`Embedding API error ${res.status}: ${body}`, res.status);
  }

  const data = (await res.json()) as {
    data?: Array<{ embedding?: number[] }>;
    error?: unknown;
  };
  if (data.error) {
    throw new EmbeddingApiError(`Embedding API error: ${JSON.stringify(data.error)}`);
  }
  const embedding = data.data?.[0]?.embedding;
  if (!Array.isArray(embedding) || embedding.length === 0) {
    throw new EmbeddingApiError('No embedding returned');
  }
  return embedding;
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/**
 * Script entry point: lists chunks without embeddings and, unless `--dry-run`
 * was passed, embeds and stores them one at a time.
 *
 * A chunk that fails with HTTP 429 is retried once after a backoff; any other
 * failure is logged and counted, and processing continues with the next chunk.
 * Exits with status 1 when a required environment variable is missing.
 */
async function main(): Promise<void> {
  if (!DB_URL) {
    console.error('MEMORY_DB_URL not set');
    process.exit(1);
  }
  if (!EMBED_API_URL) {
    console.error('EMBED_BASE_URL not set');
    process.exit(1);
  }

  const pool = new pg.Pool({ connectionString: DB_URL, max: 3 });

  // Embeds one chunk and inserts the resulting row. The vector is passed as
  // its JSON text form and cast to `vector` by the database.
  const embedAndStore = async (chunk: MissingChunk): Promise<void> => {
    const embedding = await generateEmbedding(chunk.chunk_text);
    await pool.query(
      `INSERT INTO memory_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
       VALUES ($1, $2::vector, $3, $4)`,
      [chunk.id, JSON.stringify(embedding), EMBED_BASE_URL, EMBED_MODEL],
    );
  };

  try {
    // Find chunks without embeddings
    const { rows: missing } = await pool.query<MissingChunk>(
      `SELECT mc.id, mc.chunk_text
       FROM memory_chunks mc
       LEFT JOIN memory_embeddings me ON me.chunk_id = mc.id
       WHERE me.chunk_id IS NULL
       ORDER BY mc.id`,
    );

    console.log(`Found ${missing.length} chunks without embeddings`);
    if (DRY_RUN) {
      console.log('Dry run — exiting.');
      return;
    }

    let ok = 0;
    let fail = 0;

    for (const [index, chunk] of missing.entries()) {
      try {
        await embedAndStore(chunk);
        ok++;
      } catch (err: unknown) {
        console.error(`  FAIL chunk ${chunk.id}: ${errorMessage(err)}`);
        if (isRateLimited(err)) {
          // If rate limited, back off and retry once.
          console.log('  Rate limited — waiting 10s...');
          await sleep(RATE_LIMIT_BACKOFF_MS);
          try {
            await embedAndStore(chunk);
            ok++;
          } catch (retryErr: unknown) {
            fail++;
            console.error(`  RETRY FAIL chunk ${chunk.id}: ${errorMessage(retryErr)}`);
          }
        } else {
          fail++;
        }
      }

      // Report progress every BATCH_SIZE chunks, whether or not this one succeeded.
      const processed = index + 1;
      if (processed % BATCH_SIZE === 0) {
        console.log(`  [${processed}/${missing.length}] ok=${ok} fail=${fail}`);
      }
      // No need to pause after the final chunk.
      if (processed < missing.length) await sleep(DELAY_MS);
    }

    console.log(`\nDone. ok=${ok} fail=${fail} total=${missing.length}`);
  } finally {
    // Always release connections, including when the query above throws.
    await pool.end();
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});