整理项目记忆与Agent RAG入口
迁移项目共享记忆到 docs/project-memory,保留 .hermes 仅作为工具目录;新增 Agent 本地 RAG 索引与上下文包检索脚本;记录 RAG 依赖只安装到 .rag/runtime 并加入忽略规则;同步文档与检查脚本中的项目记忆路径。
This commit is contained in:
221
scripts/rag/rag-utils.mjs
Normal file
221
scripts/rag/rag-utils.mjs
Normal file
@@ -0,0 +1,221 @@
|
||||
import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs';
|
||||
import { dirname, extname, join, relative, resolve } from 'node:path';
|
||||
import { fileURLToPath, pathToFileURL } from 'node:url';
|
||||
|
||||
// Directory that contains this module file.
const moduleDirectory = dirname(fileURLToPath(import.meta.url));

/** Absolute path two directory levels above this module's directory. */
export const repoRoot = resolve(moduleDirectory, '../..');

/** Absolute path of the RAG configuration JSON file under `repoRoot`. */
export const configPath = join(repoRoot, 'scripts/rag/rag-config.json');
|
||||
|
||||
/**
 * Reads the file at `configPath` as UTF-8 text and parses it as JSON.
 *
 * @returns {any} The parsed configuration value.
 * @throws If the file cannot be read or its content is not valid JSON.
 */
export function readConfig() {
  const rawConfig = readFileSync(configPath, 'utf8');
  return JSON.parse(rawConfig);
}
|
||||
|
||||
/**
 * Converts every backslash in a path string to a forward slash.
 *
 * @param {string} filePath - Path that may contain `\` separators.
 * @returns {string} The same path using `/` separators only.
 */
export function normalizePath(filePath) {
  const backslashes = /\\/gu;
  return filePath.replace(backslashes, '/');
}
|
||||
|
||||
/**
 * Expresses a path relative to `repoRoot`, using forward slashes.
 *
 * @param {string} filePath - Path to convert.
 * @returns {string} The slash-normalized path relative to `repoRoot`.
 */
export function repoRelative(filePath) {
  const relativePath = relative(repoRoot, filePath);
  return normalizePath(relativePath);
}
|
||||
|
||||
/**
 * Resolves a path against `repoRoot`.
 *
 * @param {string} filePath - Path to resolve; an absolute path is returned as resolved by `path.resolve`.
 * @returns {string} The resulting absolute path.
 */
export function resolveRepoPath(filePath) {
  const absolutePath = resolve(repoRoot, filePath);
  return absolutePath;
}
|
||||
|
||||
/**
 * Builds the path of the `node_modules` directory inside the configured runtime directory.
 *
 * @param {{ runtimeDir: string }} config - Configuration whose `runtimeDir` is joined onto `repoRoot`.
 * @returns {string} Path `<repoRoot>/<runtimeDir>/node_modules`.
 */
export function getRuntimeNodeModules(config) {
  const { runtimeDir } = config;
  return join(repoRoot, runtimeDir, 'node_modules');
}
|
||||
|
||||
/**
 * Checks that both RAG runtime packages exist under the runtime `node_modules` directory.
 *
 * @param {{ runtimeDir: string }} config - Configuration providing `runtimeDir`.
 * @returns {string} The runtime `node_modules` path when both packages are present.
 * @throws {Error} With installation instructions when either package directory is missing.
 */
export function assertLocalRuntime(config) {
  const runtimeModules = getRuntimeNodeModules(config);
  const requiredPackages = ['@lancedb/lancedb', '@huggingface/transformers'];
  const isInstalled = (packageName) => existsSync(join(runtimeModules, packageName));

  // Guard clause: anything missing produces the full setup instructions.
  if (!requiredPackages.every(isInstalled)) {
    const messageLines = [
      '本地 RAG 运行时依赖尚未安装。',
      '按项目约定,RAG 依赖不进入根 package.json,也不默认安装。',
      '需要启用 RAG 时,Agent 必须先询问用户,然后在本地 gitignored 目录安装:',
      '',
      ` mkdir -p ${config.runtimeDir}`,
      ` npm init -y --prefix ${config.runtimeDir}`,
      ` npm install --prefix ${config.runtimeDir} @lancedb/lancedb@0.30.0 @huggingface/transformers@4.2.0`,
      '',
      `当前检查目录:${runtimeModules}`,
    ];
    throw new Error(messageLines.join('\n'));
  }

  return runtimeModules;
}
|
||||
|
||||
/**
 * Dynamically imports the two RAG runtime packages from the local runtime directory
 * and configures the transformers environment.
 *
 * @param {{ runtimeDir: string, modelCacheDir: string }} config - Configuration providing
 *   the runtime directory and the model cache directory (joined onto `repoRoot`).
 * @returns {Promise<{ lancedb: object, transformers: object }>} The imported module namespaces.
 * @throws {Error} When `assertLocalRuntime` finds the runtime packages missing, or an import fails.
 */
export async function loadRagRuntime(config) {
  const runtimeModules = assertLocalRuntime(config);

  // Imports are addressed by file URL so they resolve inside the runtime node_modules directory.
  const importFromRuntime = (entryPoint) =>
    import(pathToFileURL(join(runtimeModules, entryPoint)).href);

  const lancedb = await importFromRuntime('@lancedb/lancedb/dist/index.js');
  const transformers = await importFromRuntime(
    '@huggingface/transformers/dist/transformers.node.mjs',
  );

  const { env } = transformers;
  env.cacheDir = join(repoRoot, config.modelCacheDir);
  env.useFSCache = true;
  env.allowRemoteModels = true;

  return { lancedb, transformers };
}
|
||||
|
||||
/**
 * Collects the text files of every configured source, de-duplicated by repo-relative path.
 *
 * @param {{ exclude?: string[], sources?: Array<{ path: string, optional?: boolean, weight?: number }> }} config
 *   Configuration listing sources and excluded path prefixes.
 * @param {number} [limitFiles=Infinity] - Collection stops as soon as this many files are gathered.
 * @returns {Array<{ path: string, rel: string, weight: number }>} One entry per file, where
 *   `weight` is the source's `weight` or 1 when absent.
 * @throws {Error} When a source path does not exist and the source is not marked `optional`.
 */
export function listSourceFiles(config, limitFiles = Number.POSITIVE_INFINITY) {
  const excluded = config.exclude ?? [];
  const sources = config.sources ?? [];
  const collected = [];
  const seenRelativePaths = new Set();

  for (const source of sources) {
    const sourcePath = resolveRepoPath(source.path);
    const isMissing = !existsSync(sourcePath);

    if (isMissing && !source.optional) {
      throw new Error(`RAG source not found: ${source.path}`);
    }
    if (isMissing) {
      continue;
    }

    for (const filePath of walkTextFiles(sourcePath, excluded)) {
      const rel = repoRelative(filePath);
      // A file reachable through several sources keeps the entry of the first source.
      if (!seenRelativePaths.has(rel)) {
        seenRelativePaths.add(rel);
        collected.push({ path: filePath, rel, weight: source.weight ?? 1 });
        if (collected.length >= limitFiles) {
          return collected;
        }
      }
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Lists the readable text files at or below a path.
 *
 * A file target yields itself (when `shouldReadFile` accepts it); any other target is
 * walked recursively as a directory. Entries whose repo-relative path (directories carry
 * a trailing `/`) starts with an excluded prefix are skipped together with their subtree.
 *
 * @param {string} targetPath - Existing file or directory to scan.
 * @param {string[]} excluded - Repo-relative path prefixes to skip.
 * @returns {string[]} Accepted file paths, sorted by repo-relative path via `localeCompare`.
 * @throws If `targetPath` cannot be stat'ed or a directory cannot be read.
 */
function walkTextFiles(targetPath, excluded) {
  if (statSync(targetPath).isFile()) {
    return shouldReadFile(targetPath, excluded) ? [targetPath] : [];
  }

  const files = [];

  const walk = (dir) => {
    for (const name of readdirSync(dir)) {
      const child = join(dir, name);

      // Stat each entry once and reuse the result. With throwIfNoEntry disabled,
      // an entry that no longer resolves (dangling symlink, file removed mid-walk)
      // is skipped instead of aborting the whole walk.
      const childStat = statSync(child, { throwIfNoEntry: false });
      if (!childStat) {
        continue;
      }

      const isDirectory = childStat.isDirectory();
      const rel = `${repoRelative(child)}${isDirectory ? '/' : ''}`;
      if (excluded.some((prefix) => rel.startsWith(prefix))) {
        continue;
      }

      if (isDirectory) {
        walk(child);
      } else if (shouldReadFile(child, excluded)) {
        files.push(child);
      }
    }
  };

  walk(targetPath);

  // Compute each sort key once rather than on every comparison.
  return files
    .map((file) => ({ file, key: repoRelative(file) }))
    .sort((a, b) => a.key.localeCompare(b.key))
    .map(({ file }) => file);
}
|
||||
|
||||
/**
 * Decides whether a file should be read as a text source.
 *
 * @param {string} filePath - File path to test.
 * @param {string[]} excluded - Repo-relative path prefixes that are never read.
 * @returns {boolean} `false` when the repo-relative path starts with an excluded prefix;
 *   otherwise `true` for `AGENTS.md`, `CONTEXT.md`, any `.../README.md`, or any file whose
 *   extension is `.md` or `.txt` (case-insensitive).
 */
function shouldReadFile(filePath, excluded) {
  const rel = repoRelative(filePath);

  const isExcluded = excluded.some((prefix) => rel.startsWith(prefix));
  if (isExcluded) {
    return false;
  }

  const isNamedDocument =
    rel === 'AGENTS.md' || rel === 'CONTEXT.md' || rel.endsWith('/README.md');
  if (isNamedDocument) {
    return true;
  }

  const textExtensions = new Set(['.md', '.txt']);
  const extension = extname(filePath).toLowerCase();
  return textExtensions.has(extension);
}
|
||||
|
||||
/**
 * Splits text into chunks of at most `maxChars` characters.
 *
 * Line endings are normalized to `\n` and the text is trimmed. The text is then split
 * before every line that starts with 1-6 `#` characters followed by whitespace; consecutive
 * blocks are merged (joined by a blank line) while they fit within `maxChars`. A single
 * block longer than `maxChars` is cut into sliding windows of `maxChars` characters that
 * advance by `maxChars - overlapChars` (at least 1).
 *
 * @param {string} text - Source text.
 * @param {{ maxChars?: number, overlapChars?: number }} [options={}] - Chunk size limits;
 *   defaults are 1600 and 220.
 * @returns {Array<{ index: number, text: string }>} Non-empty, trimmed chunks numbered from 0.
 */
export function chunkText(text, options = {}) {
  const maxChars = options.maxChars ?? 1600;
  const overlapChars = options.overlapChars ?? 220;

  const normalized = text.replace(/\r\n?/gu, '\n').trim();
  if (!normalized) {
    return [];
  }

  // Always advance by at least one character so the window loop terminates
  // even when overlapChars >= maxChars.
  const stride = Math.max(1, maxChars - overlapChars);
  const blocks = normalized.split(/\n(?=#{1,6}\s+)/u);
  const chunks = [];
  let current = '';

  // Every chunk goes through here so whitespace-only pieces are never emitted.
  const pushChunk = (candidate) => {
    const trimmed = candidate.trim();
    if (trimmed) {
      chunks.push(trimmed);
    }
  };

  const flushCurrent = () => {
    pushChunk(current);
    current = '';
  };

  for (const block of blocks) {
    // +2 accounts for the blank-line separator inserted between merged blocks.
    if (current.length + block.length + 2 <= maxChars) {
      current = current ? `${current}\n\n${block}` : block;
      continue;
    }

    flushCurrent();

    if (block.length <= maxChars) {
      current = block;
      continue;
    }

    for (let start = 0; start < block.length; start += stride) {
      const end = start + maxChars;
      pushChunk(block.slice(start, end));
      // Once a window reaches the end of the block, any further window would
      // contain only text that is already covered by this one.
      if (end >= block.length) {
        break;
      }
    }
  }

  flushCurrent();

  return chunks.map((chunk, index) => ({ index, text: chunk }));
}
|
||||
|
||||
/**
 * Builds a chunk identifier of the form `<filePath>#<chunkIndex>`.
 *
 * @param {string} filePath - Path part of the identifier.
 * @param {number} chunkIndex - Index part of the identifier.
 * @returns {string} The combined identifier.
 */
export function buildChunkId(filePath, chunkIndex) {
  const separator = '#';
  return `${filePath}${separator}${chunkIndex}`;
}
|
||||
|
||||
/**
 * Returns the text of the first line of the form `# <title>`, or a fallback.
 *
 * @param {string} text - Text to search.
 * @param {*} fallback - Value returned when no such line exists or its title is empty.
 * @returns {*} The trimmed title, or `fallback`.
 */
export function extractTitle(text, fallback) {
  const headingMatch = text.match(/^#\s+(.+)$/mu);
  const heading = headingMatch?.[1]?.trim();

  if (heading) {
    return heading;
  }
  return fallback;
}
|
||||
|
||||
/**
 * Creates an embedding function backed by a `feature-extraction` pipeline.
 *
 * @param {{ pipeline: Function }} transformers - Object whose `pipeline(task, model)` returns
 *   (a promise of) a callable extractor.
 * @param {string} model - Model identifier passed to `pipeline`.
 * @returns {Promise<(text: string, type: string) => Promise<number[]>>} Function that prefixes
 *   the text with `query: ` when `type === 'query'` and `passage: ` otherwise, runs the
 *   extractor with mean pooling and normalization, and returns `output.data` as a plain
 *   array of numbers.
 */
export async function createEmbedder(transformers, model) {
  const extractor = await transformers.pipeline('feature-extraction', model);

  return async function embed(text, type) {
    let prefix = 'passage: ';
    if (type === 'query') {
      prefix = 'query: ';
    }

    const input = `${prefix}${text}`;
    const output = await extractor(input, { pooling: 'mean', normalize: true });

    return Array.from(output.data, (value) => Number(value));
  };
}
|
||||
|
||||
/**
 * Parses the value following `--limit-files` in an argument list.
 *
 * @param {string[]} argv - Command-line arguments.
 * @returns {number} The positive integer given, or `Infinity` when the option is absent
 *   or its value is empty.
 * @throws {Error} When the value is not a positive integer.
 */
export function parseLimitFiles(argv) {
  const rawValue = readArg(argv, '--limit-files');

  if (rawValue) {
    const limit = Number(rawValue);
    const isPositiveInteger = Number.isInteger(limit) && limit > 0;
    if (!isPositiveInteger) {
      throw new Error(`Invalid --limit-files value: ${rawValue}`);
    }
    return limit;
  }

  return Number.POSITIVE_INFINITY;
}
|
||||
|
||||
/**
 * Returns the argument that immediately follows a named option.
 *
 * @param {string[]} argv - Command-line arguments.
 * @param {string} name - Option to look for, matched exactly (e.g. `--limit-files`).
 * @param {*} [fallback] - Value returned when the option is absent or has nothing after it.
 * @returns {*} The following argument, or `fallback`.
 */
export function readArg(argv, name, fallback = undefined) {
  const position = argv.indexOf(name);
  const value = position === -1 ? undefined : argv[position + 1];
  return value ?? fallback;
}
|
||||
|
||||
/**
 * Reports whether an argument list contains a flag.
 *
 * @param {string[]} argv - Command-line arguments.
 * @param {string} name - Flag to look for, matched exactly.
 * @returns {boolean} `true` when `name` is an element of `argv`.
 */
export function hasFlag(argv, name) {
  const isPresent = argv.includes(name);
  return isPresent;
}
|
||||
Reference in New Issue
Block a user