// clawdie-ai/scripts/backfill-embeddings.ts

/**
 * Backfill missing vector embeddings for memory chunks.
 *
 * Useful after embedding API outages, provider switches, or fresh installs
 * where chunks were stored but embeddings failed. Repairs semantic search
 * without re-importing memories.
 *
 * Requires env vars from .env — source them before running:
 *   env $(grep -v '^#' .env | xargs) npx tsx scripts/backfill-embeddings.ts
 *
 * Options:
 *   --dry-run   Count missing embeddings without writing anything
 *   BATCH_SIZE=N  Environment variable (not a CLI flag): process N chunks
 *                 before logging progress (default 5)
 */
import pg from 'pg';

/**
 * Build the embeddings endpoint URL from a provider base URL.
 *
 * Trailing slashes are stripped first so a base of `https://host/v1/` yields
 * `https://host/v1/embeddings` rather than `https://host/v1//embeddings`.
 *
 * @param baseUrl - Provider base URL, typically `process.env.EMBED_BASE_URL`.
 * @returns The endpoint URL, or `''` when no base URL is configured.
 */
function resolveEmbeddingsUrl(baseUrl: string | undefined): string {
  if (!baseUrl) return '';
  return `${baseUrl.replace(/\/+$/, '')}/embeddings`;
}

/**
 * Parse a strictly positive base-10 integer from an optional string.
 *
 * @param raw - Raw text, e.g. an environment variable value.
 * @param fallback - Value returned when `raw` is missing, not numeric, or <= 0.
 * @returns The parsed integer, or `fallback`.
 */
function parsePositiveInt(raw: string | undefined, fallback: number): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

/** Postgres connection string; `''` when MEMORY_DB_URL is unset (main() rejects an empty value). */
const DB_URL = process.env.MEMORY_DB_URL ?? '';

/** Full embeddings endpoint derived from EMBED_BASE_URL; `''` when it is unset. */
const EMBED_API_URL = resolveEmbeddingsUrl(process.env.EMBED_BASE_URL);

/** Embedding model name sent with every request; an empty value falls back to the default. */
const EMBED_MODEL = process.env.EMBED_MODEL || 'BAAI/bge-m3';

/** Optional bearer token; when empty no Authorization header is sent. */
const EMBED_API_KEY = process.env.EMBED_API_KEY ?? '';

/** Value of the BATCH_SIZE env var; invalid or non-positive input falls back to 5. */
const BATCH_SIZE = parsePositiveInt(process.env.BATCH_SIZE, 5);

/** True when the script was started with `--dry-run`. */
const DRY_RUN = process.argv.includes('--dry-run');

const DELAY_MS = 500; // rate limit cushion

/**
 * Type guard: true when `value` is a non-empty array of finite numbers.
 */
function isEmbeddingVector(value: unknown): value is number[] {
  return (
    Array.isArray(value) &&
    value.length > 0 &&
    value.every((v) => typeof v === 'number' && Number.isFinite(v))
  );
}

/**
 * Request an embedding vector for `text` from the configured embeddings endpoint.
 *
 * POSTs `{ model: EMBED_MODEL, input: text }` as JSON to EMBED_API_URL and adds
 * a Bearer Authorization header when EMBED_API_KEY is non-empty.
 *
 * @param text - Text to embed.
 * @returns The `embedding` of the first entry in the response's `data` array.
 * @throws Error when the HTTP status is not OK (message begins with
 *   `Embedding API error <status>:`), when the body carries an `error` field,
 *   when `data` is missing or empty, or when the embedding is not a non-empty
 *   array of finite numbers.
 */
async function generateEmbedding(text: string): Promise<number[]> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (EMBED_API_KEY) headers['Authorization'] = `Bearer ${EMBED_API_KEY}`;

  const res = await fetch(EMBED_API_URL, {
    method: 'POST',
    headers,
    body: JSON.stringify({ model: EMBED_MODEL, input: text }),
  });

  if (!res.ok) {
    const body = await res.text();
    throw new Error(`Embedding API error ${res.status}: ${body}`);
  }

  // The body is external input: keep `embedding` as unknown until validated.
  const payload = (await res.json()) as {
    data?: Array<{ embedding?: unknown }>;
    error?: unknown;
  } | null;

  if (payload?.error) {
    throw new Error(`Embedding API error: ${JSON.stringify(payload.error)}`);
  }

  const first = payload?.data?.[0];
  if (first === undefined || first === null) throw new Error('No embedding returned');

  // Reject malformed vectors here instead of letting them fail later at the INSERT.
  const embedding = first.embedding;
  if (!isEmbeddingVector(embedding)) {
    throw new Error('Malformed embedding returned: expected a non-empty array of numbers');
  }
  return embedding;
}

/**
 * Pause for roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 * @returns A promise that settles (with no value) once the timer fires.
 */
function sleep(ms: number) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/** Row shape returned by the "chunks without embeddings" query. */
type MissingChunk = { id: string; chunk_text: string };

/** Progress-log interval used when BATCH_SIZE is not a positive integer. */
const DEFAULT_PROGRESS_INTERVAL = 5;

/** Pause before the single retry that follows a rate-limit (429) failure. */
const RATE_LIMIT_BACKOFF_MS = 10_000;

/** Turn any thrown value into a printable message. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** True when the error text mentions 429 as a standalone number (not inside e.g. 14291). */
function isRateLimited(err: unknown): boolean {
  return /\b429\b/.test(describeError(err));
}

/** Generate the embedding for one chunk and insert it; throws if either step fails. */
async function embedAndStore(pool: pg.Pool, chunk: MissingChunk): Promise<void> {
  const embedding = await generateEmbedding(chunk.chunk_text);
  await pool.query(
    `INSERT INTO memory_embeddings (chunk_id, embedding, embedding_provider, embedding_model)
     VALUES ($1, $2::vector, $3, $4)`,
    [chunk.id, JSON.stringify(embedding), process.env.EMBED_BASE_URL, EMBED_MODEL],
  );
}

/**
 * Backfill embeddings for every memory chunk that has no row in memory_embeddings.
 *
 * Exits the process with status 1 when MEMORY_DB_URL or EMBED_BASE_URL is not
 * configured. With `--dry-run` only the number of missing embeddings is printed.
 * Otherwise chunks are processed one at a time with a DELAY_MS pause between
 * them; a failure whose message mentions 429 is retried once after a 10s
 * back-off. Per-chunk failures are logged and counted but do not abort the run.
 * Progress is logged every BATCH_SIZE chunks, whether they succeeded or failed.
 * The connection pool is closed on every exit path after it has been created.
 */
async function main(): Promise<void> {
  if (!DB_URL) {
    console.error('MEMORY_DB_URL not set');
    process.exit(1);
  }
  if (!EMBED_API_URL) {
    console.error('EMBED_BASE_URL not set');
    process.exit(1);
  }

  // A NaN or non-positive interval would make `% interval` never hit 0.
  const progressEvery =
    Number.isInteger(BATCH_SIZE) && BATCH_SIZE > 0 ? BATCH_SIZE : DEFAULT_PROGRESS_INTERVAL;

  const pool = new pg.Pool({ connectionString: DB_URL, max: 3 });

  try {
    // Find chunks without embeddings
    const { rows: missing } = await pool.query<MissingChunk>(
      `SELECT mc.id, mc.chunk_text
       FROM memory_chunks mc
       LEFT JOIN memory_embeddings me ON me.chunk_id = mc.id
       WHERE me.chunk_id IS NULL
       ORDER BY mc.id`,
    );

    console.log(`Found ${missing.length} chunks without embeddings`);
    if (DRY_RUN) {
      console.log('Dry run — exiting.');
      return;
    }

    let ok = 0;
    let fail = 0;

    for (const [index, chunk] of missing.entries()) {
      try {
        await embedAndStore(pool, chunk);
        ok++;
      } catch (err: unknown) {
        console.error(`  FAIL chunk ${chunk.id}: ${describeError(err)}`);

        let recovered = false;
        if (isRateLimited(err)) {
          // If rate limited, back off and retry once.
          console.log('  Rate limited — waiting 10s...');
          await sleep(RATE_LIMIT_BACKOFF_MS);
          try {
            await embedAndStore(pool, chunk);
            recovered = true;
          } catch (retryErr: unknown) {
            console.error(`  RETRY FAIL chunk ${chunk.id}: ${describeError(retryErr)}`);
          }
        }

        if (recovered) {
          ok++;
        } else {
          fail++;
        }
      }

      // Report progress on processed count so a failing chunk cannot hide the line.
      const processed = index + 1;
      if (processed % progressEvery === 0) {
        console.log(`  [${processed}/${missing.length}] ok=${ok} fail=${fail}`);
      }

      // Rate-limit cushion between requests; pointless after the last chunk.
      if (processed < missing.length) {
        await sleep(DELAY_MS);
      }
    }

    console.log(`\nDone. ok=${ok} fail=${fail} total=${missing.length}`);
  } finally {
    // Release connections even when the query or the loop throws.
    await pool.end();
  }
}

// Script entry point: run the backfill; if main() rejects, print the reason
// and terminate the process with exit status 1.
main().then(undefined, (reason) => {
  console.error(reason);
  process.exit(1);
});