Files
Genarrative/packages/shared/src/llm/narrativeLanguage.ts
kdletters cbc27bad4a
Some checks failed
CI / verify (push) Has been cancelled
init with react+axum+spacetimedb
2026-04-26 18:06:23 +08:00

67 lines
1.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
const CJK_CHAR_PATTERN = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/gu;
const LATIN_WORD_PATTERN = /[A-Za-z][A-Za-z'-]{1,}/g;
const LATIN_FRAGMENT_PATTERN =
/[A-Za-z][A-Za-z0-9'"()\-,:;!?/]*(?:\s+[A-Za-z0-9'"()\-,:;!?/]+)+/gu;
const SAFE_LATIN_TOKENS = new Set([
'act',
'ai',
'boss',
'cd',
'hp',
'json',
'llm',
'mp',
'npc',
'qa',
'rpg',
]);
function getCjkCharCount(text: string) {
return text.match(CJK_CHAR_PATTERN)?.length ?? 0;
}
function getSignificantLatinWords(text: string) {
return (text.match(LATIN_WORD_PATTERN) ?? [])
.map((word) => word.toLowerCase())
.filter((word) => word.length >= 4 && !SAFE_LATIN_TOKENS.has(word));
}
export function hasMixedNarrativeLanguage(text: string) {
const trimmed = text.trim();
if (!trimmed) {
return false;
}
const cjkCharCount = getCjkCharCount(trimmed);
const latinSentenceFragments = (trimmed.match(LATIN_FRAGMENT_PATTERN) ?? [])
.map((fragment) => fragment.trim())
.filter((fragment) => fragment.split(/\s+/u).length >= 2);
const significantLatinWords = getSignificantLatinWords(trimmed);
if (latinSentenceFragments.length > 0) {
return true;
}
if (cjkCharCount > 0 && significantLatinWords.length >= 2) {
return true;
}
return cjkCharCount === 0 && significantLatinWords.length >= 3;
}
export function sanitizePromptNarrativeText(
text: string | null | undefined,
fallback: string | null = null,
) {
if (typeof text !== 'string') {
return fallback;
}
const trimmed = text.trim();
if (!trimmed) {
return fallback;
}
return hasMixedNarrativeLanguage(trimmed) ? fallback : trimmed;
}