import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs';
import { dirname, extname, join, relative, resolve } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';

/** Absolute path two directory levels above the directory containing this module. */
export const repoRoot = resolve(dirname(fileURLToPath(import.meta.url)), '../..');

/** Absolute path of the RAG configuration file. */
export const configPath = join(repoRoot, 'scripts/rag/rag-config.json');

/** File extensions (lower-case, with leading dot) that are indexed as text. */
const TEXT_EXTENSIONS = new Set(['.md', '.txt']);

/**
 * Reads and parses the JSON file at {@link configPath}.
 *
 * @returns {any} The parsed configuration.
 * @throws If the file cannot be read or is not valid JSON.
 */
export function readConfig() {
  return JSON.parse(readFileSync(configPath, 'utf8'));
}

/**
 * Replaces every backslash in a path with a forward slash.
 *
 * @param {string} filePath
 * @returns {string}
 */
export function normalizePath(filePath) {
  return filePath.replace(/\\/gu, '/');
}

/**
 * Expresses a path relative to {@link repoRoot}, using forward slashes.
 *
 * @param {string} filePath
 * @returns {string}
 */
export function repoRelative(filePath) {
  return normalizePath(relative(repoRoot, filePath));
}

/**
 * Resolves a path against {@link repoRoot}; absolute paths are returned as-is
 * (normalized by `path.resolve`).
 *
 * @param {string} filePath
 * @returns {string}
 */
export function resolveRepoPath(filePath) {
  return resolve(repoRoot, filePath);
}

/**
 * Returns the `node_modules` directory inside the configured runtime directory.
 *
 * @param {{ runtimeDir: string }} config
 * @returns {string}
 */
export function getRuntimeNodeModules(config) {
  return join(repoRoot, config.runtimeDir, 'node_modules');
}

/**
 * Verifies that both `@lancedb/lancedb` and `@huggingface/transformers` exist
 * in the local runtime `node_modules` directory.
 *
 * @param {{ runtimeDir: string }} config
 * @returns {string} The runtime `node_modules` path when both packages exist.
 * @throws {Error} With installation instructions when either package is missing.
 */
export function assertLocalRuntime(config) {
  const runtimeModules = getRuntimeNodeModules(config);
  const hasLance = existsSync(join(runtimeModules, '@lancedb/lancedb'));
  const hasTransformers = existsSync(join(runtimeModules, '@huggingface/transformers'));
  if (hasLance && hasTransformers) {
    return runtimeModules;
  }

  throw new Error(
    [
      '本地 RAG 运行时依赖尚未安装。',
      '按项目约定,RAG 依赖不进入根 package.json,也不默认安装。',
      '需要启用 RAG 时,Agent 必须先询问用户,然后在本地 gitignored 目录安装:',
      '',
      ` mkdir -p ${config.runtimeDir}`,
      ` npm init -y --prefix ${config.runtimeDir}`,
      ` npm install --prefix ${config.runtimeDir} @lancedb/lancedb@0.30.0 @huggingface/transformers@4.2.0`,
      '',
      `当前检查目录:${runtimeModules}`,
    ].join('\n'),
  );
}

/**
 * Dynamically imports the LanceDB and Transformers packages from the local
 * runtime directory and points the Transformers cache at the configured
 * model cache directory.
 *
 * @param {{ runtimeDir: string, modelCacheDir: string }} config
 * @returns {Promise<{ lancedb: any, transformers: any }>} The two imported module namespaces.
 * @throws {Error} When the local runtime is not installed (see `assertLocalRuntime`).
 */
export async function loadRagRuntime(config) {
  const runtimeModules = assertLocalRuntime(config);
  // Import by file URL so the packages are loaded from the local runtime
  // directory rather than resolved from this module's own location.
  const lancedb = await import(
    pathToFileURL(join(runtimeModules, '@lancedb/lancedb/dist/index.js')).href
  );
  const transformers = await import(
    pathToFileURL(
      join(runtimeModules, '@huggingface/transformers/dist/transformers.node.mjs'),
    ).href
  );
  transformers.env.cacheDir = join(repoRoot, config.modelCacheDir);
  transformers.env.useFSCache = true;
  transformers.env.allowRemoteModels = true;
  return { lancedb, transformers };
}

/**
 * Collects the text files of every configured source, de-duplicated by
 * repo-relative path, stopping once `limitFiles` entries were collected.
 *
 * @param {{ sources?: Array<{ path: string, optional?: boolean, weight?: number }>, exclude?: string[] }} config
 * @param {number} [limitFiles=Infinity] Maximum number of files to return.
 * @returns {Array<{ path: string, rel: string, weight: number }>}
 * @throws {Error} When a non-optional source path does not exist.
 */
export function listSourceFiles(config, limitFiles = Number.POSITIVE_INFINITY) {
  const excluded = config.exclude ?? [];
  const files = [];
  const seen = new Set();

  for (const source of config.sources ?? []) {
    const sourcePath = resolveRepoPath(source.path);
    if (!existsSync(sourcePath)) {
      if (!source.optional) {
        throw new Error(`RAG source not found: ${source.path}`);
      }
      continue;
    }

    for (const filePath of walkTextFiles(sourcePath, excluded)) {
      const rel = repoRelative(filePath);
      // The first source that yields a file decides its weight.
      if (seen.has(rel)) {
        continue;
      }
      seen.add(rel);
      files.push({ path: filePath, rel, weight: source.weight ?? 1 });
      if (files.length >= limitFiles) {
        return files;
      }
    }
  }

  return files;
}

/**
 * Tells whether a repo-relative path starts with any of the excluded prefixes.
 *
 * @param {string} rel Repo-relative path (directories carry a trailing slash).
 * @param {string[]} excluded Path prefixes to skip.
 * @returns {boolean}
 */
function isExcluded(rel, excluded) {
  return excluded.some((prefix) => rel.startsWith(prefix));
}

/**
 * Returns the readable text files at or below `targetPath`, sorted by
 * repo-relative path.
 *
 * @param {string} targetPath Absolute path of a file or directory.
 * @param {string[]} excluded Repo-relative path prefixes to skip.
 * @returns {string[]} Absolute file paths.
 */
function walkTextFiles(targetPath, excluded) {
  if (statSync(targetPath).isFile()) {
    return shouldReadFile(targetPath, excluded) ? [targetPath] : [];
  }

  const files = [];
  const walk = (dir) => {
    for (const name of readdirSync(dir)) {
      const child = join(dir, name);
      // Stat each entry once and reuse the result for both the exclusion
      // check and the recursion decision.
      const isDirectory = statSync(child).isDirectory();
      // Directories get a trailing slash so a prefix such as "docs/tmp/"
      // matches the directory itself and the whole subtree is pruned.
      const rel = `${repoRelative(child)}${isDirectory ? '/' : ''}`;
      if (isExcluded(rel, excluded)) {
        continue;
      }
      if (isDirectory) {
        walk(child);
      } else if (shouldReadFile(child, excluded)) {
        files.push(child);
      }
    }
  };

  walk(targetPath);
  return files.sort((a, b) => repoRelative(a).localeCompare(repoRelative(b)));
}

/**
 * Decides whether a file should be indexed: it must not be excluded and must
 * be one of the well-known Markdown entry files or have a text extension.
 *
 * @param {string} filePath Absolute file path.
 * @param {string[]} excluded Repo-relative path prefixes to skip.
 * @returns {boolean}
 */
function shouldReadFile(filePath, excluded) {
  const rel = repoRelative(filePath);
  if (isExcluded(rel, excluded)) {
    return false;
  }
  if (rel === 'AGENTS.md' || rel === 'CONTEXT.md' || rel.endsWith('/README.md')) {
    return true;
  }
  return TEXT_EXTENSIONS.has(extname(filePath).toLowerCase());
}

/**
 * Splits text into chunks of at most `maxChars` characters.
 *
 * The text is first split before every Markdown heading line; consecutive
 * blocks are merged (joined by a blank line) while they fit in `maxChars`.
 * A single block longer than `maxChars` is cut into windows of `maxChars`
 * characters that overlap by `overlapChars`.
 *
 * @param {string} text
 * @param {{ maxChars?: number, overlapChars?: number }} [options]
 * @returns {Array<{ index: number, text: string }>} Non-empty, trimmed chunks in order.
 */
export function chunkText(text, options = {}) {
  const maxChars = options.maxChars ?? 1600;
  const overlapChars = options.overlapChars ?? 220;
  const normalized = text.replace(/\r\n?/gu, '\n').trim();
  if (!normalized) {
    return [];
  }

  const blocks = normalized.split(/\n(?=#{1,6}\s+)/u);
  const chunks = [];
  let current = '';

  // Every chunk goes through this helper so whitespace-only pieces are
  // never emitted, regardless of which path produced them.
  const pushChunk = (candidate) => {
    const trimmed = candidate.trim();
    if (trimmed) {
      chunks.push(trimmed);
    }
  };
  const flushCurrent = () => {
    pushChunk(current);
    current = '';
  };

  // Always advance by at least one character, even if overlap >= maxChars.
  const step = Math.max(1, maxChars - overlapChars);

  for (const block of blocks) {
    if (current.length + block.length + 2 <= maxChars) {
      current = current ? `${current}\n\n${block}` : block;
      continue;
    }

    flushCurrent();
    if (block.length <= maxChars) {
      current = block;
      continue;
    }

    for (let start = 0; start < block.length; start += step) {
      pushChunk(block.slice(start, start + maxChars));
      // Once a window reaches the end of the block, any further window
      // would only repeat a suffix of this one.
      if (start + maxChars >= block.length) {
        break;
      }
    }
  }

  flushCurrent();
  return chunks.map((chunk, index) => ({ index, text: chunk }));
}

/**
 * Builds a chunk identifier of the form `<filePath>#<chunkIndex>`.
 *
 * @param {string} filePath
 * @param {number} chunkIndex
 * @returns {string}
 */
export function buildChunkId(filePath, chunkIndex) {
  return `${filePath}#${chunkIndex}`;
}

/**
 * Returns the text of the first level-1 Markdown heading (`# Title`), or
 * `fallback` when there is none.
 *
 * @param {string} text
 * @param {string} fallback
 * @returns {string}
 */
export function extractTitle(text, fallback) {
  // Only spaces/tabs may separate "#" from the title; allowing any
  // whitespace would let a bare "#" line capture the following line.
  const title = text.match(/^#[ \t]+(.+)$/mu)?.[1]?.trim();
  return title || fallback;
}

/**
 * Creates an embedding function backed by a `feature-extraction` pipeline.
 *
 * @param {any} transformers The Transformers module namespace (see `loadRagRuntime`).
 * @param {string} model Model identifier passed to `transformers.pipeline`.
 * @returns {Promise<(text: string, type?: string) => Promise<number[]>>}
 *   A function that embeds `text`, prefixed with `query: ` when `type` is
 *   `'query'` and `passage: ` otherwise, using mean pooling and normalization.
 */
export async function createEmbedder(transformers, model) {
  const extractor = await transformers.pipeline('feature-extraction', model);
  return async function embed(text, type) {
    const prefix = type === 'query' ? 'query: ' : 'passage: ';
    const output = await extractor(`${prefix}${text}`, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(output.data, Number);
  };
}

/**
 * Parses the `--limit-files <n>` command-line option.
 *
 * @param {string[]} argv
 * @returns {number} The positive integer given, or `Infinity` when absent.
 * @throws {Error} When the value is not a positive integer.
 */
export function parseLimitFiles(argv) {
  const value = readArg(argv, '--limit-files');
  if (!value) {
    return Number.POSITIVE_INFINITY;
  }
  const parsed = Number(value);
  if (!Number.isInteger(parsed) || parsed <= 0) {
    throw new Error(`Invalid --limit-files value: ${value}`);
  }
  return parsed;
}

/**
 * Returns the argument that follows `name` in `argv`.
 *
 * @param {string[]} argv
 * @param {string} name Option name, e.g. `--limit-files`.
 * @param {string} [fallback] Returned when `name` is absent or is the last argument.
 * @returns {string | undefined}
 */
export function readArg(argv, name, fallback = undefined) {
  const index = argv.indexOf(name);
  if (index === -1) {
    return fallback;
  }
  return argv[index + 1] ?? fallback;
}

/**
 * Tells whether `argv` contains the flag `name`.
 *
 * @param {string[]} argv
 * @param {string} name
 * @returns {boolean}
 */
export function hasFlag(argv, name) {
  return argv.includes(name);
}