chore: commit backfill-embeddings maintenance script
Repairs memory_chunks rows missing vector embeddings — useful after embedding API outages, provider switches, or fresh installs. Has dry-run mode and rate-limit backoff. Not a skill; run manually when semantic memory search degrades. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a5bd81fac1
commit
e97a1dec2c
1 changed files with 135 additions and 0 deletions
135
scripts/backfill-embeddings.ts
Normal file
135
scripts/backfill-embeddings.ts
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
/**
 * Backfill missing vector embeddings for memory chunks.
 *
 * Useful after embedding API outages, provider switches, or fresh installs
 * where chunks were stored but embeddings failed. Repairs semantic search
 * without re-importing memories.
 *
 * Requires env vars from .env — source them before running:
 *   env $(grep -v '^#' .env | xargs) npx tsx scripts/backfill-embeddings.ts
 *
 * Options:
 *   --dry-run       Count missing embeddings without writing anything
 *   BATCH_SIZE=N    (environment variable) Process N chunks before logging
 *                   progress (default 5)
 */
|
||||
import pg from 'pg';
|
||||
|
||||
// Postgres connection string, read from MEMORY_DB_URL. Falls back to '' (rather
// than a non-null assertion) so the emptiness check in main() is honest.
const DB_URL = process.env.MEMORY_DB_URL ?? '';

// Full embeddings endpoint derived from EMBED_BASE_URL, or '' when unset.
// Trailing slashes on the base are dropped so the result never contains
// "//embeddings".
const EMBED_API_URL = process.env.EMBED_BASE_URL
  ? `${process.env.EMBED_BASE_URL.replace(/\/+$/, '')}/embeddings`
  : '';

// Model name sent with each embedding request. `||` is deliberate: an empty
// EMBED_MODEL value is treated the same as an unset one.
const EMBED_MODEL = process.env.EMBED_MODEL || 'BAAI/bge-m3';

// Optional bearer token; '' means no Authorization header is sent.
const EMBED_API_KEY = process.env.EMBED_API_KEY ?? '';

// BATCH_SIZE from the environment (the file header describes it as the number
// of chunks between progress logs). Non-numeric, zero, or negative values fall
// back to the default of 5 instead of propagating NaN.
const requestedBatchSize = Number.parseInt(process.env.BATCH_SIZE || '5', 10);
const BATCH_SIZE =
  Number.isInteger(requestedBatchSize) && requestedBatchSize > 0 ? requestedBatchSize : 5;

// True when the script was started with the --dry-run flag.
const DRY_RUN = process.argv.includes('--dry-run');

const DELAY_MS = 500; // rate limit cushion
|
||||
|
||||
/**
 * Requests an embedding vector for `text` from the configured embeddings endpoint.
 *
 * POSTs `{ model: EMBED_MODEL, input: text }` as JSON to `EMBED_API_URL`. A
 * Bearer `Authorization` header is attached only when `EMBED_API_KEY` is
 * non-empty.
 *
 * @param text - The text to embed.
 * @returns The `embedding` array of the first entry in the response's `data` list.
 * @throws Error with message `Embedding API error <status>: <body>` on a non-OK
 *   HTTP status, `Embedding API error: <json>` when the payload carries an
 *   `error` field, `No embedding returned` when no embedding is present, or a
 *   malformed-embedding error when the embedding is not an array of numbers.
 */
async function generateEmbedding(text: string): Promise<number[]> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (EMBED_API_KEY) headers['Authorization'] = `Bearer ${EMBED_API_KEY}`;

  const res = await fetch(EMBED_API_URL, {
    method: 'POST',
    headers,
    body: JSON.stringify({ model: EMBED_MODEL, input: text }),
  });

  if (!res.ok) {
    const body = await res.text();
    throw new Error(`Embedding API error ${res.status}: ${body}`);
  }

  const payload = (await res.json()) as {
    data?: Array<{ embedding?: unknown }>;
    error?: unknown;
  } | null;

  // Some servers report failures in the body of an otherwise OK response.
  if (payload?.error) {
    throw new Error(`Embedding API error: ${JSON.stringify(payload.error)}`);
  }

  // Validate before returning: the caller serialises this value straight into
  // an INSERT, so a missing or malformed embedding must fail here instead.
  const embedding: unknown = payload?.data?.[0]?.embedding;
  if (!Array.isArray(embedding) || embedding.length === 0) {
    throw new Error('No embedding returned');
  }
  if (!embedding.every((value: unknown) => typeof value === 'number')) {
    throw new Error('Embedding API returned a malformed embedding (non-numeric values)');
  }
  return embedding;
}
|
||||
|
||||
/**
 * Returns a promise that resolves (with no value) once a `setTimeout` of `ms`
 * milliseconds fires.
 *
 * @param ms - Delay in milliseconds passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Script entry point.
 *
 * Selects every `memory_chunks` row that has no matching `memory_embeddings`
 * row, and for each one requests an embedding and inserts it. With `--dry-run`
 * only the count is printed and nothing is written.
 *
 * Exits the process with code 1 when `MEMORY_DB_URL` or `EMBED_BASE_URL` is not
 * configured. Per-chunk failures are logged and counted but do not abort the
 * run; a failure that looks like HTTP 429 is retried once after a 10s pause.
 * The connection pool is closed on every exit path, including when a query
 * throws.
 */
async function main(): Promise<void> {
  if (!DB_URL) {
    console.error('MEMORY_DB_URL not set');
    process.exit(1);
  }
  if (!EMBED_API_URL) {
    console.error('EMBED_BASE_URL not set');
    process.exit(1);
  }

  // Progress is reported every BATCH_SIZE chunks, as the file header documents.
  // Guard against NaN / non-positive values so the modulo below stays meaningful.
  const logEvery = Number.isInteger(BATCH_SIZE) && BATCH_SIZE > 0 ? BATCH_SIZE : 5;

  const pool = new pg.Pool({ connectionString: DB_URL, max: 3 });

  // Extracts a printable message from whatever was thrown.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  // Fetches an embedding for one chunk and stores it; used for both the first
  // attempt and the rate-limit retry so the two cannot drift apart.
  const embedAndStore = async (chunk: { id: string; chunk_text: string }): Promise<void> => {
    const embedding = await generateEmbedding(chunk.chunk_text);
    await pool.query(
      `INSERT INTO memory_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
       VALUES ($1, $2::vector, $3, $4)`,
      // The embedding is sent as a JSON array string and cast to `vector` in SQL.
      [chunk.id, JSON.stringify(embedding), process.env.EMBED_BASE_URL, EMBED_MODEL],
    );
  };

  try {
    // Find chunks without embeddings
    const { rows: missing } = await pool.query<{ id: string; chunk_text: string }>(
      `SELECT mc.id, mc.chunk_text
       FROM memory_chunks mc
       LEFT JOIN memory_embeddings me ON me.chunk_id = mc.id
       WHERE me.chunk_id IS NULL
       ORDER BY mc.id`,
    );

    console.log(`Found ${missing.length} chunks without embeddings`);
    if (DRY_RUN) {
      console.log('Dry run — exiting.');
      return;
    }

    let ok = 0;
    let fail = 0;

    for (const [i, chunk] of missing.entries()) {
      try {
        await embedAndStore(chunk);
        ok++;
      } catch (err: unknown) {
        const message = describeError(err);
        console.error(`  FAIL chunk ${chunk.id}: ${message}`);

        // Heuristic: the embedding helper puts the HTTP status in the message.
        // Match 429 as a whole token so e.g. "14290" does not trigger a back-off.
        if (/\b429\b/.test(message)) {
          console.log('  Rate limited — waiting 10s...');
          await sleep(10_000);
          // retry once
          try {
            await embedAndStore(chunk);
            ok++;
          } catch (retryErr: unknown) {
            fail++;
            console.error(`  RETRY FAIL chunk ${chunk.id}: ${describeError(retryErr)}`);
          }
        } else {
          fail++;
        }
      }

      // Report on a fixed cadence regardless of whether this chunk succeeded.
      if ((i + 1) % logEvery === 0) {
        console.log(`  [${i + 1}/${missing.length}] ok=${ok} fail=${fail}`);
      }

      await sleep(DELAY_MS);
    }

    console.log(`\nDone. ok=${ok} fail=${fail} total=${missing.length}`);
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
// Run the script. Any rejection escaping main() is printed and the process
// exits with code 1 so callers (shell, CI) can detect the failure.
main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});
|
||||
Loading…
Add table
Reference in a new issue