// RAG index builder script.
//
// Flow: read the configured source files, split each into chunks, embed every
// chunk, then write all rows to a LanceDB table (overwriting any existing one).
// With `--dry-run`, only the first ten chunk ids/titles are printed and the
// process exits before any runtime is loaded or anything is written.
import { mkdirSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import {
  buildChunkId,
  chunkText,
  createEmbedder,
  extractTitle,
  hasFlag,
  listSourceFiles,
  loadRagRuntime,
  parseLimitFiles,
  readConfig,
  repoRoot,
} from './rag-utils.mjs';

const config = readConfig();
const limitFiles = parseLimitFiles(process.argv);
const dryRun = hasFlag(process.argv, '--dry-run');
const files = listSourceFiles(config, limitFiles);

// One row per chunk; rows keep file order, then chunk order within each file.
const rows = [];
for (const { path: sourcePath, rel, weight } of files) {
  const contents = readFileSync(sourcePath, 'utf8');
  const title = extractTitle(contents, rel);
  const fileRows = Array.from(chunkText(contents, config.chunk ?? {}), (piece) => ({
    id: buildChunkId(rel, piece.index),
    path: rel,
    title,
    chunk_index: piece.index,
    source_weight: weight,
    text: piece.text,
  }));
  rows.push(...fileRows);
}

console.log(`[rag:index] source files=${files.length}, chunks=${rows.length}`);

if (dryRun) {
  // Preview only: show at most the first ten chunks, then stop with success.
  rows.slice(0, 10).forEach((row) => {
    console.log(`- ${row.id} ${row.title}`);
  });
  process.exit(0);
}

if (rows.length === 0) {
  throw new Error('No RAG chunks found.');
}

// The runtime is loaded only after the dry-run / empty checks have passed.
const { lancedb, transformers } = await loadRagRuntime(config);
const embed = await createEmbedder(transformers, config.model);

// Embed strictly one chunk at a time, reporting progress every 25 rows and
// once more when the final row is done.
const total = rows.length;
for (const [position, row] of rows.entries()) {
  row.vector = await embed(row.text, 'passage');
  const completed = position + 1;
  if (completed % 25 === 0 || completed === total) {
    console.log(`[rag:index] embedded ${completed}/${total}`);
  }
}

// Make sure the database directory exists before connecting to it.
const databasePath = join(repoRoot, config.databaseDir);
mkdirSync(databasePath, { recursive: true });

const db = await lancedb.connect(databasePath);
await db.createTable(config.tableName, rows, { mode: 'overwrite' });

console.log(
  `[rag:index] wrote table=${config.tableName}, db=${config.databaseDir}, model=${config.model}`,
);