/**
 * Backfill missing vector embeddings for memory chunks.
 *
 * Useful after embedding API outages, provider switches, or fresh installs
 * where chunks were stored but embeddings failed. Repairs semantic search
 * without re-importing memories.
 *
 * Requires env vars from .env — source them before running:
 *   env $(grep -v '^#' .env | xargs) npx tsx scripts/backfill-embeddings.ts
 *
 * Options:
 *   --dry-run      Count missing embeddings without writing anything
 *   BATCH_SIZE=N   Process N chunks before logging progress (default 5)
 */
import pg from 'pg';

const DEFAULT_BATCH_SIZE = 5;
const DELAY_MS = 500; // rate limit cushion between consecutive chunks
const RATE_LIMIT_BACKOFF_MS = 10_000; // pause before the single retry after an HTTP 429

const DB_URL = process.env.MEMORY_DB_URL ?? '';
const EMBED_BASE_URL = process.env.EMBED_BASE_URL ?? '';
// Strip trailing slashes so a base URL ending in "/" does not yield "//embeddings".
const EMBED_API_URL = EMBED_BASE_URL ? `${EMBED_BASE_URL.replace(/\/+$/, '')}/embeddings` : '';
const EMBED_MODEL = process.env.EMBED_MODEL || 'BAAI/bge-m3';
const EMBED_API_KEY = process.env.EMBED_API_KEY ?? '';
const BATCH_SIZE = parseBatchSize(process.env.BATCH_SIZE);
const DRY_RUN = process.argv.includes('--dry-run');

const SELECT_MISSING_SQL =
  'SELECT mc.id, mc.chunk_text FROM memory_chunks mc ' +
  'LEFT JOIN memory_embeddings me ON me.chunk_id = mc.id ' +
  'WHERE me.chunk_id IS NULL ORDER BY mc.id';

const INSERT_EMBEDDING_SQL =
  'INSERT INTO memory_embeddings (chunk_id, embedding, embedding_provider, embedding_model) ' +
  'VALUES ($1, $2::vector, $3, $4)';

/** Row shape returned by the "chunks without embeddings" query. */
interface MissingChunk {
  id: string;
  chunk_text: string;
}

/** Subset of the embeddings endpoint response that this script reads. */
interface EmbeddingResponse {
  data?: Array<{ embedding: number[] }>;
  error?: unknown;
}

/** Error raised for any embedding API failure; `status` is set for non-2xx HTTP responses. */
class EmbeddingApiError extends Error {
  readonly status?: number;

  constructor(message: string, status?: number) {
    super(message);
    this.name = 'EmbeddingApiError';
    this.status = status;
  }
}

/**
 * Parse the BATCH_SIZE env value (progress-logging interval).
 * Falls back to the default when the value is unset, non-numeric, or not positive,
 * because a NaN or zero interval would silently disable progress output.
 */
function parseBatchSize(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_BATCH_SIZE;
}

/** Best-effort human-readable message for a caught value of unknown type. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** True only when the embedding API answered with HTTP 429. */
function isRateLimited(err: unknown): boolean {
  return err instanceof EmbeddingApiError && err.status === 429;
}

/**
 * Request an embedding vector for `text` from the configured embeddings endpoint.
 *
 * @param text - Chunk text to embed.
 * @returns The first embedding in the response.
 * @throws EmbeddingApiError on a non-2xx response, an `error` field in the
 *   payload, or a payload that contains no embedding.
 */
async function generateEmbedding(text: string): Promise<number[]> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (EMBED_API_KEY) headers['Authorization'] = `Bearer ${EMBED_API_KEY}`;

  const res = await fetch(EMBED_API_URL, {
    method: 'POST',
    headers,
    body: JSON.stringify({ model: EMBED_MODEL, input: text }),
  });

  if (!res.ok) {
    const body = await res.text();
    throw new EmbeddingApiError(`Embedding API error ${res.status}: ${body}`, res.status);
  }

  const payload = (await res.json()) as EmbeddingResponse;
  if (payload.error) {
    throw new EmbeddingApiError(`Embedding API error: ${JSON.stringify(payload.error)}`);
  }

  const first = payload.data?.[0];
  if (!first || !Array.isArray(first.embedding)) {
    throw new EmbeddingApiError('No embedding returned');
  }
  return first.embedding;
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Entry point: find chunks that have no embedding row, embed each one, and
 * insert the result. Chunks are processed sequentially with a short delay
 * between them; an HTTP 429 triggers one retry after a longer pause.
 * Exits the process with code 1 when required env vars are missing.
 */
async function main(): Promise<void> {
  if (!DB_URL) {
    console.error('MEMORY_DB_URL not set');
    process.exit(1);
  }
  if (!EMBED_API_URL) {
    console.error('EMBED_BASE_URL not set');
    process.exit(1);
  }

  const pool = new pg.Pool({ connectionString: DB_URL, max: 3 });

  // Embed one chunk and persist the vector. The embedding is serialized as a
  // JSON array string and cast to `vector` by the INSERT statement.
  const embedAndStore = async (chunk: MissingChunk): Promise<void> => {
    const embedding = await generateEmbedding(chunk.chunk_text);
    await pool.query(INSERT_EMBEDDING_SQL, [
      chunk.id,
      JSON.stringify(embedding),
      EMBED_BASE_URL,
      EMBED_MODEL,
    ]);
  };

  try {
    // Find chunks without embeddings
    const { rows: missing } = await pool.query<MissingChunk>(SELECT_MISSING_SQL);
    console.log(`Found ${missing.length} chunks without embeddings`);

    if (DRY_RUN) {
      console.log('Dry run — exiting.');
      return;
    }

    let ok = 0;
    let fail = 0;

    for (const [index, chunk] of missing.entries()) {
      try {
        await embedAndStore(chunk);
        ok++;
      } catch (err: unknown) {
        console.error(` FAIL chunk ${chunk.id}: ${describeError(err)}`);

        let recovered = false;
        if (isRateLimited(err)) {
          console.log(' Rate limited — waiting 10s...');
          await sleep(RATE_LIMIT_BACKOFF_MS);
          // retry once
          try {
            await embedAndStore(chunk);
            recovered = true;
          } catch (retryErr: unknown) {
            console.error(` RETRY FAIL chunk ${chunk.id}: ${describeError(retryErr)}`);
          }
        }

        if (recovered) {
          ok++;
        } else {
          fail++;
        }
      }

      // Report progress on every BATCH_SIZE-th chunk, whether it succeeded or not.
      const processed = index + 1;
      if (processed % BATCH_SIZE === 0) {
        console.log(` [${processed}/${missing.length}] ok=${ok} fail=${fail}`);
      }

      // No need to wait after the final chunk.
      if (processed < missing.length) {
        await sleep(DELAY_MS);
      }
    }

    console.log(`\nDone. ok=${ok} fail=${fail} total=${missing.length}`);
  } finally {
    // Always release connections, including when the initial query throws.
    await pool.end();
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});