Files
Genarrative/scripts/rag/rag-utils.mjs
kdletters 15a527d7f4 整理项目记忆与Agent RAG入口
迁移项目共享记忆到 docs/project-memory,保留 .hermes 仅作为工具目录

新增 Agent 本地 RAG 索引与上下文包检索脚本

记录 RAG 依赖只安装到 .rag/runtime 并加入忽略规则

同步文档与检查脚本中的项目记忆路径
2026-06-16 16:06:54 +08:00

222 lines
6.3 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs';
import { dirname, extname, join, relative, resolve } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
// Directory that contains this module file.
const moduleDir = dirname(fileURLToPath(import.meta.url));

/** Absolute path of the repository root: two directory levels above this module. */
export const repoRoot = resolve(moduleDir, '..', '..');

/** Absolute path of the RAG configuration JSON file inside the repository. */
export const configPath = join(repoRoot, 'scripts', 'rag', 'rag-config.json');
/**
 * Loads the RAG configuration stored at `configPath`.
 *
 * @returns {any} The value obtained by parsing the file's UTF-8 text as JSON.
 * @throws {Error} If the file cannot be read or does not contain valid JSON.
 */
export function readConfig() {
  const rawJson = readFileSync(configPath, 'utf8');
  return JSON.parse(rawJson);
}
/**
 * Converts every backslash in a path string to a forward slash.
 *
 * @param {string} filePath - Path that may contain `\` separators.
 * @returns {string} The same path using `/` separators only.
 */
export function normalizePath(filePath) {
  const segments = filePath.split('\\');
  return segments.join('/');
}
/**
 * Expresses a path relative to `repoRoot`, using forward slashes.
 *
 * @param {string} filePath - Path to convert.
 * @returns {string} The path relative to the repository root, normalized with `normalizePath`.
 */
export function repoRelative(filePath) {
  const fromRoot = relative(repoRoot, filePath);
  return normalizePath(fromRoot);
}
/**
 * Resolves a path against `repoRoot`.
 *
 * @param {string} filePath - Path to resolve; `path.resolve` leaves absolute paths pointing at themselves.
 * @returns {string} The resolved absolute path.
 */
export function resolveRepoPath(filePath) {
  const absolutePath = resolve(repoRoot, filePath);
  return absolutePath;
}
/**
 * Builds the path of the `node_modules` directory inside the configured runtime directory.
 *
 * @param {{ runtimeDir: string }} config - Configuration object; only `runtimeDir` is read.
 * @returns {string} `repoRoot` joined with `config.runtimeDir` and `node_modules`.
 */
export function getRuntimeNodeModules(config) {
  const { runtimeDir } = config;
  return join(repoRoot, runtimeDir, 'node_modules');
}
/**
 * Verifies that both RAG runtime packages exist under the runtime `node_modules` directory.
 *
 * @param {{ runtimeDir: string }} config - Configuration object; `runtimeDir` locates the runtime
 *   and is echoed in the installation instructions.
 * @returns {string} The runtime `node_modules` path when both packages are present.
 * @throws {Error} With multi-line installation instructions when either package directory is missing.
 */
export function assertLocalRuntime(config) {
  const runtimeModules = getRuntimeNodeModules(config);
  const lancePresent = existsSync(join(runtimeModules, '@lancedb/lancedb'));
  const transformersPresent = existsSync(join(runtimeModules, '@huggingface/transformers'));

  if (!lancePresent || !transformersPresent) {
    // The message doubles as setup instructions, so it lists the exact commands to run.
    const instructions = [
      '本地 RAG 运行时依赖尚未安装。',
      '按项目约定RAG 依赖不进入根 package.json也不默认安装。',
      '需要启用 RAG 时Agent 必须先询问用户,然后在本地 gitignored 目录安装:',
      '',
      ` mkdir -p ${config.runtimeDir}`,
      ` npm init -y --prefix ${config.runtimeDir}`,
      ` npm install --prefix ${config.runtimeDir} @lancedb/lancedb@0.30.0 @huggingface/transformers@4.2.0`,
      '',
      `当前检查目录:${runtimeModules}`,
    ];
    throw new Error(instructions.join('\n'));
  }

  return runtimeModules;
}
/**
 * Dynamically imports the two RAG runtime packages from the local runtime directory.
 *
 * The packages are loaded by absolute file URL from the runtime `node_modules` directory,
 * after `assertLocalRuntime` has confirmed they are present.
 *
 * @param {{ runtimeDir: string, modelCacheDir: string }} config - Configuration object.
 * @returns {Promise<{ lancedb: object, transformers: object }>} The imported module namespaces.
 * @throws {Error} If the runtime packages are missing (from `assertLocalRuntime`) or an import fails.
 */
export async function loadRagRuntime(config) {
  const runtimeModules = assertLocalRuntime(config);

  // Turns a path inside the runtime node_modules into a file:// URL string for import().
  const toEntryUrl = (entry) => pathToFileURL(join(runtimeModules, entry)).href;

  const lancedb = await import(toEntryUrl('@lancedb/lancedb/dist/index.js'));
  const transformers = await import(
    toEntryUrl('@huggingface/transformers/dist/transformers.node.mjs')
  );

  // Point the library's cache at the repository-local directory and configure its env flags.
  const { env } = transformers;
  env.cacheDir = join(repoRoot, config.modelCacheDir);
  env.useFSCache = true;
  env.allowRemoteModels = true;

  return { lancedb, transformers };
}
/**
 * Collects the text files of every configured source, de-duplicated by repo-relative path.
 *
 * @param {{ exclude?: string[], sources?: Array<{ path: string, optional?: boolean, weight?: number }> }} config
 *   Configuration object; `exclude` holds repo-relative path prefixes to skip.
 * @param {number} [limitFiles=Infinity] - Stop and return as soon as this many files are collected.
 * @returns {Array<{ path: string, rel: string, weight: number }>} One entry per file, in source order;
 *   `weight` is the owning source's weight (1 when unset).
 * @throws {Error} If a source path does not exist and the source is not marked `optional`.
 */
export function listSourceFiles(config, limitFiles = Number.POSITIVE_INFINITY) {
  const excluded = config.exclude ?? [];
  const sources = config.sources ?? [];
  const collected = [];
  const seenRelPaths = new Set();

  for (const source of sources) {
    const sourcePath = resolveRepoPath(source.path);

    if (existsSync(sourcePath)) {
      for (const filePath of walkTextFiles(sourcePath, excluded)) {
        const rel = repoRelative(filePath);
        // A file reachable from several sources is kept only for the first one.
        if (!seenRelPaths.has(rel)) {
          seenRelPaths.add(rel);
          collected.push({ path: filePath, rel, weight: source.weight ?? 1 });
          if (collected.length >= limitFiles) {
            return collected;
          }
        }
      }
    } else if (!source.optional) {
      throw new Error(`RAG source not found: ${source.path}`);
    }
  }

  return collected;
}
/**
 * Recursively lists the readable text files at or below `targetPath`.
 *
 * @param {string} targetPath - File or directory to scan.
 * @param {string[]} excluded - Repo-relative path prefixes to skip. Directories are matched
 *   with a trailing `/` appended, so a prefix such as `docs/tmp/` excludes the directory itself.
 * @returns {string[]} Matching file paths, sorted by repo-relative path using `localeCompare`.
 *   A single file yields a one-element (or empty) array.
 * @throws {Error} If `statSync`/`readdirSync` fails for any visited path.
 */
function walkTextFiles(targetPath, excluded) {
  if (statSync(targetPath).isFile()) {
    return shouldReadFile(targetPath, excluded) ? [targetPath] : [];
  }

  const found = [];
  const walk = (dir) => {
    for (const name of readdirSync(dir)) {
      const child = join(dir, name);
      // Stat each entry once. statSync (not dirent types) is kept so that
      // symbolic links are still resolved to their targets.
      const isDirectory = statSync(child).isDirectory();
      const rel = repoRelative(child);
      const relForMatch = isDirectory ? `${rel}/` : rel;
      if (excluded.some((prefix) => relForMatch.startsWith(prefix))) {
        continue;
      }
      if (isDirectory) {
        walk(child);
      } else if (shouldReadFile(child, excluded)) {
        // Cache the relative path so sorting does not recompute it per comparison.
        found.push({ path: child, rel });
      }
    }
  };
  walk(targetPath);

  return found
    .sort((a, b) => a.rel.localeCompare(b.rel))
    .map((entry) => entry.path);
}
/**
 * Decides whether a file should be read as a RAG text source.
 *
 * @param {string} filePath - Path of the candidate file.
 * @param {string[]} excluded - Repo-relative path prefixes; a match rejects the file.
 * @returns {boolean} `false` for excluded files; otherwise `true` for `AGENTS.md`, `CONTEXT.md`,
 *   any nested `README.md`, and any file whose extension is `.md` or `.txt` (case-insensitive).
 */
function shouldReadFile(filePath, excluded) {
  const rel = repoRelative(filePath);

  const isExcluded = excluded.some((prefix) => rel.startsWith(prefix));
  if (isExcluded) {
    return false;
  }

  const isWellKnownDoc =
    rel === 'AGENTS.md' || rel === 'CONTEXT.md' || rel.endsWith('/README.md');
  if (isWellKnownDoc) {
    return true;
  }

  const extension = extname(filePath).toLowerCase();
  return extension === '.md' || extension === '.txt';
}
/**
 * Splits text into chunks of at most `maxChars` characters, preferring Markdown heading boundaries.
 *
 * Line endings are normalized to `\n` and the text is trimmed. The text is then split before every
 * line that starts with 1-6 `#` characters followed by whitespace. Consecutive blocks are merged
 * (joined by a blank line) while they fit within `maxChars`. A single block longer than `maxChars`
 * is cut into overlapping windows of `maxChars` characters.
 *
 * @param {string} text - Text to split.
 * @param {{ maxChars?: number, overlapChars?: number }} [options] - Chunking limits;
 *   `maxChars` defaults to 1600 and `overlapChars` to 220.
 * @returns {Array<{ index: number, text: string }>} Non-empty chunks with their 0-based position.
 */
export function chunkText(text, options = {}) {
  const maxChars = options.maxChars ?? 1600;
  const overlapChars = options.overlapChars ?? 220;
  const normalized = text.replace(/\r\n?/gu, '\n').trim();
  if (!normalized) {
    return [];
  }

  // Always advance by at least one character so an overlap >= maxChars cannot loop forever.
  const step = Math.max(1, maxChars - overlapChars);
  const blocks = normalized.split(/\n(?=#{1,6}\s+)/u);
  const chunks = [];
  let current = '';

  const pushCurrent = () => {
    const trimmed = current.trim();
    if (trimmed) {
      chunks.push(trimmed);
    }
    current = '';
  };

  for (const block of blocks) {
    // +2 accounts for the blank-line separator inserted between merged blocks.
    if (current.length + block.length + 2 <= maxChars) {
      current = current ? `${current}\n\n${block}` : block;
      continue;
    }
    pushCurrent();
    if (block.length <= maxChars) {
      current = block;
      continue;
    }
    for (let start = 0; start < block.length; start += step) {
      const piece = block.slice(start, start + maxChars).trim();
      // A window made only of whitespace carries no content; do not emit an empty chunk.
      if (piece) {
        chunks.push(piece);
      }
      // This window already reached the end of the block; any further window
      // would lie entirely inside it and only duplicate its tail.
      if (start + maxChars >= block.length) {
        break;
      }
    }
  }
  pushCurrent();

  return chunks.map((chunk, index) => ({ index, text: chunk }));
}
/**
 * Builds the identifier of a chunk from its file path and chunk index.
 *
 * @param {string} filePath - Path identifying the source file.
 * @param {number} chunkIndex - Position of the chunk within that file.
 * @returns {string} The two values joined by `#`.
 */
export function buildChunkId(filePath, chunkIndex) {
  const chunkId = `${filePath}#${chunkIndex}`;
  return chunkId;
}
/**
 * Returns the text of the first level-1 Markdown heading (`# Title`) found in `text`.
 *
 * @param {string} text - Document text to search.
 * @param {string} fallback - Value returned when no non-empty level-1 heading exists.
 * @returns {string} The trimmed heading text, or `fallback`.
 */
export function extractTitle(text, fallback) {
  // The separator after `#` must stay on the same line: plain `\s+` also matches line
  // breaks, which made a bare `#` line capture the following line as the title.
  const title = text.match(/^#[^\S\r\n\u2028\u2029]+(.+)$/mu)?.[1]?.trim();
  return title || fallback;
}
/**
 * Creates an embedding function backed by a `feature-extraction` pipeline.
 *
 * @param {{ pipeline: Function }} transformers - Object exposing `pipeline(task, model)`.
 * @param {string} model - Model identifier passed to the pipeline factory.
 * @returns {Promise<(text: string, type: string) => Promise<number[]>>} An async `embed` function.
 *   It prefixes the text with `query: ` when `type` is `'query'` and `passage: ` otherwise,
 *   requests mean pooling with normalization, and returns the output's `data` as plain numbers.
 */
export async function createEmbedder(transformers, model) {
  const extractor = await transformers.pipeline('feature-extraction', model);
  const extractionOptions = () => ({ pooling: 'mean', normalize: true });

  const embed = async (text, type) => {
    let prefix = 'passage: ';
    if (type === 'query') {
      prefix = 'query: ';
    }
    const output = await extractor(prefix + text, extractionOptions());
    return Array.from(output.data, (value) => Number(value));
  };

  return embed;
}
/**
 * Reads the `--limit-files` option from an argument list.
 *
 * @param {string[]} argv - Command-line arguments.
 * @returns {number} The parsed positive integer, or `Infinity` when the option is absent or empty.
 * @throws {Error} If a value is given but is not a positive integer.
 */
export function parseLimitFiles(argv) {
  const rawValue = readArg(argv, '--limit-files');
  if (rawValue) {
    const limit = Number(rawValue);
    const isPositiveInteger = Number.isInteger(limit) && limit > 0;
    if (!isPositiveInteger) {
      throw new Error(`Invalid --limit-files value: ${rawValue}`);
    }
    return limit;
  }
  return Number.POSITIVE_INFINITY;
}
/**
 * Returns the argument that immediately follows a named option.
 *
 * @param {string[]} argv - Command-line arguments.
 * @param {string} name - Option name to look for (first occurrence wins).
 * @param {*} [fallback] - Value returned when the option is absent or is the last argument.
 * @returns {*} The element after `name`, or `fallback`.
 */
export function readArg(argv, name, fallback = undefined) {
  const position = argv.indexOf(name);
  const isPresent = position >= 0;
  return isPresent ? (argv[position + 1] ?? fallback) : fallback;
}
/**
 * Reports whether a flag appears anywhere in the argument list.
 *
 * @param {string[]} argv - Command-line arguments.
 * @param {string} name - Flag to look for.
 * @returns {boolean} `true` when `argv` includes `name`.
 */
export function hasFlag(argv, name) {
  const isPresent = argv.includes(name);
  return isPresent;
}