// File viewer metadata: 67 lines, 1.6 KiB, TypeScript
/**
 * Matches a single CJK ideograph from one of three Unicode blocks:
 * U+3400–U+4DBF (CJK Unified Ideographs Extension A),
 * U+4E00–U+9FFF (CJK Unified Ideographs) and
 * U+F900–U+FAFF (CJK Compatibility Ideographs).
 */
const CJK_CHAR_PATTERN = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/gu;

/**
 * Matches a Latin "word": an ASCII letter followed by at least one more
 * ASCII letter, apostrophe (straight or curly) or hyphen.
 */
const LATIN_WORD_PATTERN = /[A-Za-z][A-Za-z'’-]{1,}/g;

/**
 * Matches a run of two or more whitespace-separated tokens where the first
 * token starts with an ASCII letter and every token consists of ASCII
 * letters, digits and common punctuation (quotes, parentheses, `-,:;!?/`).
 */
const LATIN_FRAGMENT_PATTERN =
  /[A-Za-z][A-Za-z0-9'"“”‘’()\-,:;!?/]*(?:\s+[A-Za-z0-9'"“”‘’()\-,:;!?/]+)+/gu;

/**
 * Lower-case Latin tokens that are never counted as significant words.
 * Typed as read-only so this shared module-level allow-list cannot be
 * mutated by accident.
 */
const SAFE_LATIN_TOKENS: ReadonlySet<string> = new Set([
  'act',
  'ai',
  'boss',
  'cd',
  'hp',
  'json',
  'llm',
  'mp',
  'npc',
  'qa',
  'rpg',
]);

/**
 * Counts how many times `CJK_CHAR_PATTERN` matches within `text`.
 *
 * @param text - String to scan.
 * @returns The number of matches, or 0 when the pattern does not match.
 */
function getCjkCharCount(text: string) {
  const cjkMatches = text.match(CJK_CHAR_PATTERN);
  return cjkMatches === null ? 0 : cjkMatches.length;
}

/**
 * Collects the "significant" Latin words found in `text`.
 *
 * Each `LATIN_WORD_PATTERN` match is lower-cased and kept only when it is
 * at least four characters long and not listed in `SAFE_LATIN_TOKENS`.
 *
 * @param text - String to scan.
 * @returns Lower-cased significant words in order of appearance
 *          (duplicates are kept).
 */
function getSignificantLatinWords(text: string) {
  const significant: string[] = [];
  for (const rawWord of text.match(LATIN_WORD_PATTERN) ?? []) {
    const lowered = rawWord.toLowerCase();
    if (lowered.length < 4 || SAFE_LATIN_TOKENS.has(lowered)) {
      continue;
    }
    significant.push(lowered);
  }
  return significant;
}

/**
 * Reports whether `text` carries enough Latin-script content to be flagged.
 *
 * After trimming, the text is flagged when any of these holds:
 *  1. `LATIN_FRAGMENT_PATTERN` matches at least once (two or more
 *     whitespace-separated Latin tokens in a row);
 *  2. it contains at least one CJK character and at least two significant
 *     Latin words;
 *  3. it contains no CJK character and at least three significant Latin
 *     words.
 *
 * "Significant" is whatever `getSignificantLatinWords` returns.
 *
 * @param text - Narrative text to inspect.
 * @returns `false` for empty or whitespace-only input, otherwise whether
 *          one of the rules above applies.
 */
export function hasMixedNarrativeLanguage(text: string): boolean {
  const trimmed = text.trim();
  if (!trimmed) {
    return false;
  }

  // Every LATIN_FRAGMENT_PATTERN match starts and ends on a non-space
  // character and contains at least one internal whitespace run, so it always
  // splits into >= 2 tokens. Checking that a match exists is sufficient.
  if (trimmed.match(LATIN_FRAGMENT_PATTERN) !== null) {
    return true;
  }

  // Only scan for CJK characters and words once the cheap fragment check has
  // failed. Text containing CJK tolerates one significant Latin word; text
  // without any CJK tolerates two.
  const requiredWordCount = getCjkCharCount(trimmed) > 0 ? 2 : 3;
  return getSignificantLatinWords(trimmed).length >= requiredWordCount;
}

/**
 * Returns the trimmed `text` when it is usable, otherwise `fallback`.
 *
 * `text` is considered unusable when it is not a string, is empty after
 * trimming, or is flagged by `hasMixedNarrativeLanguage`.
 *
 * @param text - Candidate narrative text; may be `null` or `undefined`.
 * @param fallback - Value returned for unusable input (defaults to `null`).
 * @returns The trimmed text, or `fallback`.
 */
export function sanitizePromptNarrativeText(
  text: string | null | undefined,
  fallback: string | null = null,
) {
  const candidate = typeof text === 'string' ? text.trim() : '';
  const isUsable = candidate !== '' && !hasMixedNarrativeLanguage(candidate);
  return isUsable ? candidate : fallback;
}