Files
LittleWhiteBox/modules/story-summary/vector/tokenizer.js

288 lines
9.8 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { xbLog } from '../../../core/debug-core.js';
import { extensionFolderPath } from '../../../core/constants.js';
// Module tag passed as the first argument to the xbLog calls in this module.
const MODULE_ID = 'tokenizer';
// ═══════════════════════════════════════════════════════════════════════════
// 词性过滤
// ═══════════════════════════════════════════════════════════════════════════
// Part-of-speech tag prefixes that are kept: the noun family ('n…') and English ('eng').
const KEEP_POS_PREFIXES = ['n', 'eng'];

/**
 * Tells whether a part-of-speech tag belongs to a category that is kept.
 *
 * @param {string} pos - Part-of-speech tag of a segmented word.
 * @returns {boolean} True when the tag starts with one of KEEP_POS_PREFIXES.
 */
function shouldKeepByPos(pos) {
    for (const prefix of KEEP_POS_PREFIXES) {
        if (pos.startsWith(prefix)) {
            return true;
        }
    }
    return false;
}
// ═══════════════════════════════════════════════════════════════════════════
// 语言检测
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Decides whether a text contains enough Chinese characters to be segmented with jieba.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when at least five characters fall in U+4E00–U+9FFF.
 */
function shouldUseJieba(text) {
    const hanChars = text.match(/[\u4e00-\u9fff]/g);
    if (hanChars === null) {
        return false;
    }
    return hanChars.length >= 5;
}
/**
 * Guesses the dominant language of a text from per-script character counts.
 *
 * Counts CJK ideographs (U+4E00–U+9FFF), kana (U+3040–U+30FF) and ASCII letters,
 * then compares each share against the sum of the three counts.
 *
 * @param {string} text - Text to classify.
 * @returns {'jp'|'en'|'zh'} 'jp' when kana exceed 20% of the counted characters,
 *   otherwise 'en' when ASCII letters exceed 50%, otherwise 'zh'.
 */
function detectMainLanguage(text) {
    const countOf = (pattern) => (text.match(pattern) ?? []).length;

    const hanCount = countOf(/[\u4e00-\u9fff]/g);
    const kanaCount = countOf(/[\u3040-\u309f\u30a0-\u30ff]/g);
    const latinCount = countOf(/[a-zA-Z]/g);

    // Clamp to 1 so a text with none of the counted scripts does not divide by zero.
    const total = Math.max(hanCount + kanaCount + latinCount, 1);

    if (kanaCount / total > 0.2) {
        return 'jp';
    }
    return latinCount / total > 0.5 ? 'en' : 'zh';
}
// Stop-word table (replaces the original large stop-word list).
// Each string below is one themed group of space-separated words.
const STOP_WORDS = new Set(
    [
        // System / role words
        '用户 角色 玩家 旁白 user assistant system',
        // Generic time words
        '时候 现在 今天 明天 昨天 早上 晚上',
        // Generic location words
        '这里 那里 上面 下面 里面 外面',
        // Generic nouns
        '东西 事情 事儿 地方 样子 意思 感觉',
        '一下 一些 一点 一会 一次',
    ].flatMap((group) => group.split(' ')),
);
// English stop words (used by the fallback tokenizer).
// Each string below is one group of space-separated, lowercase words.
const EN_STOP_WORDS = new Set(
    [
        'the a an is are was were be been',
        'have has had do does did will would',
        'could should may might must can',
        'to of in on at for with by from',
        'and or but if that this it its',
        'i you he she we they',
        'my your his her our their',
        'what which who whom where when why how',
    ].flatMap((group) => group.split(' ')),
);
// Namespace object of the imported jieba-wasm module; null until an import has resolved.
let jiebaModule = null;
// True once the module has been imported and its default initializer (if any) has completed.
let jiebaReady = false;
// Promise of the load currently in flight, shared by concurrent callers; null when no load is running.
let jiebaLoadPromise = null;

/**
 * Imports jieba-wasm, runs its default initializer when it has one, and logs the outcome.
 * Import and initialization failures are caught, logged and reported as `false`.
 *
 * @returns {Promise<boolean>} True when the module is loaded and initialized.
 */
async function loadJiebaModule() {
    try {
        const jiebaPath = `/${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`;
        // eslint-disable-next-line no-unsanitized/method
        jiebaModule = await import(jiebaPath);
        if (jiebaModule.default) {
            await jiebaModule.default();
        }
        jiebaReady = true;
        xbLog.info(MODULE_ID, 'jieba-wasm 加载成功');

        // Diagnostics: record which exports the loaded module actually exposes.
        const keys = Object.getOwnPropertyNames(jiebaModule || {});
        const dkeys = Object.getOwnPropertyNames(jiebaModule?.default || {});
        xbLog.info(MODULE_ID, `jieba keys: ${keys.join(',')}`);
        xbLog.info(MODULE_ID, `jieba default keys: ${dkeys.join(',')}`);
        xbLog.info(MODULE_ID, `jieba.tag: ${typeof jiebaModule?.tag}`);
        return true;
    } catch (e) {
        xbLog.error(MODULE_ID, 'jieba-wasm 加载失败', e);
        return false;
    }
}

/**
 * Makes sure jieba-wasm is loaded, loading it on first use.
 *
 * Concurrent callers share a single in-flight load and all receive its result as soon as
 * it settles, instead of polling on a timer. After a failed load the shared promise is
 * cleared, so a later call starts a fresh attempt.
 *
 * @returns {Promise<boolean>} True when jieba is ready to use, false when loading failed.
 */
async function ensureJieba() {
    if (jiebaReady) return true;

    if (!jiebaLoadPromise) {
        // `.finally` runs asynchronously, i.e. only after the assignment below has happened,
        // so the reset can never be overwritten by the assignment itself.
        jiebaLoadPromise = loadJiebaModule().finally(() => {
            jiebaLoadPromise = null;
        });
    }
    return jiebaLoadPromise;
}
/**
 * Regex-based tokenizer used when jieba is not used or not available.
 *
 * Tokens are returned in this order, without de-duplication:
 *   1. runs of 2–20 ASCII letters that are not English stop words,
 *   2. runs of 2–10 kana, only when the text is detected as Japanese,
 *   3. runs of 2–6 CJK ideographs,
 *   4. digits followed by 1–4 CJK ideographs.
 *
 * The result is assembled with `concat` rather than `push(...matches)`: spreading a very
 * large match array into call arguments can exceed the engine's argument limit and throw.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Extracted tokens.
 */
function fallbackTokenize(text) {
    const lang = detectMainLanguage(text);

    // English words
    const englishWords = (text.match(/[a-zA-Z]{2,20}/gi) || [])
        .filter((word) => !EN_STOP_WORDS.has(word.toLowerCase()));

    // Japanese kana
    const kanaRuns = lang === 'jp'
        ? (text.match(/[\u3040-\u309f\u30a0-\u30ff]{2,10}/g) || [])
        : [];

    // Chinese characters / Japanese kanji
    const hanRuns = text.match(/[\u4e00-\u9fff]{2,6}/g) || [];

    // Number + ideograph combinations
    const numberedHan = text.match(/\d+[\u4e00-\u9fff]{1,4}/g) || [];

    return englishWords.concat(kanaRuns, hanRuns, numberedHan);
}
/**
 * Filters and de-duplicates candidate terms while preserving their order.
 *
 * Stop words are matched case-insensitively because STOP_WORDS stores its English
 * entries in lowercase.
 *
 * @param {Iterable<string>} terms - Candidate terms in extraction order.
 * @param {number} minLen - Minimum term length (in UTF-16 code units).
 * @param {number} maxCount - Maximum number of terms to return; 0 or less means no limit.
 * @returns {string[]} Unique terms that passed every filter.
 */
function collectUniqueTerms(terms, minLen, maxCount) {
    const unique = new Set();
    for (const term of terms) {
        if (maxCount > 0 && unique.size >= maxCount) break;
        if (term.length < minLen) continue;
        if (STOP_WORDS.has(term.toLowerCase())) continue;
        unique.add(term);
    }
    return [...unique];
}

/**
 * Extracts noun-like terms from a text.
 *
 * Text with enough Chinese characters is tagged with jieba and only words whose
 * part-of-speech passes shouldKeepByPos are kept. When jieba is not used, is
 * unavailable, or throws, the regex-based fallback tokenizer is used instead.
 *
 * @param {string} text - Source text.
 * @param {{minLen?: number, maxCount?: number}} [options] - `minLen` (default 2) is the
 *   minimum term length; `maxCount` (default 0) caps the result, 0 meaning no limit.
 * @returns {Promise<string[]>} Unique terms in order of first appearance.
 */
export async function extractNouns(text, options = {}) {
    const { minLen = 2, maxCount = 0 } = options;
    if (!text?.trim()) return [];

    // Mostly Chinese → use jieba
    if (shouldUseJieba(text)) {
        const hasJieba = await ensureJieba();
        if (hasJieba && jiebaModule?.tag) {
            try {
                const tagged = jiebaModule.tag(text, true);
                const nounWords = [];
                for (const item of Array.isArray(tagged) ? tagged : []) {
                    let word = '';
                    let pos = '';
                    // Accept both [word, pos] tuples and object-shaped items.
                    if (Array.isArray(item)) {
                        [word, pos] = item;
                    } else if (item && typeof item === 'object') {
                        word = item.word || item.w || item.text || item.term || '';
                        pos = item.tag || item.pos || item.p || '';
                    }
                    // Skip malformed entries instead of letting one bad item throw and
                    // discard the whole jieba result.
                    if (typeof word !== 'string' || typeof pos !== 'string') continue;
                    if (!word || !pos) continue;
                    if (!shouldKeepByPos(pos)) continue;
                    nounWords.push(word);
                }
                return collectUniqueTerms(nounWords, minLen, maxCount);
            } catch (e) {
                xbLog.warn(MODULE_ID, 'jieba tag 失败:' + (e && e.message ? e.message : String(e)));
            }
        }
    }

    // Not Chinese / jieba failed → fallback
    return collectUniqueTerms(fallbackTokenize(text), minLen, maxCount);
}
// Part-of-speech prefixes that earn the proper-noun bonus in scoreTermRarity.
const PROPER_NOUN_POS_PREFIXES = ['nr', 'ns', 'nt', 'nz'];

/**
 * Heuristic rarity score of a term: longer terms, terms containing ASCII letters or
 * digits, and terms tagged with a proper-noun part-of-speech score higher.
 *
 * @param {string} term - Term to score.
 * @param {string} [pos] - Part-of-speech tag; leave empty when unknown (no bonus applied).
 * @returns {number} Non-negative score.
 */
function scoreTermRarity(term, pos = '') {
    let score = 0;
    if (term.length >= 4) score += 3;
    else if (term.length >= 3) score += 1;
    if (/[a-zA-Z]/.test(term)) score += 2;
    if (/\d/.test(term)) score += 1;
    // Bonus for proper-noun tags
    if (PROPER_NOUN_POS_PREFIXES.some((prefix) => pos.startsWith(prefix))) score += 2;
    return score;
}

/**
 * Orders scored terms by descending score and returns the first `maxCount` terms.
 * Array.prototype.sort is stable, so equally scored terms keep their extraction order.
 *
 * @param {{term: string, score: number}[]} scored - Scored candidates (sorted in place).
 * @param {number} maxCount - Number of terms to keep.
 * @returns {string[]} The top terms.
 */
function pickTopRareTerms(scored, maxCount) {
    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, maxCount).map((entry) => entry.term);
}

/**
 * Extracts the terms of a text that look most distinctive.
 *
 * Text with enough Chinese characters is tagged with jieba; kept words are scored with
 * scoreTermRarity, including the proper-noun bonus. Otherwise, or when jieba is
 * unavailable or throws, the terms come from extractNouns and are scored without a
 * part-of-speech bonus.
 *
 * @param {string} text - Source text.
 * @param {number} [maxCount=15] - Maximum number of terms to return.
 * @returns {Promise<string[]>} Terms ordered from highest to lowest score.
 */
export async function extractRareTerms(text, maxCount = 15) {
    if (!text?.trim()) return [];

    // Mostly Chinese → use jieba
    if (shouldUseJieba(text)) {
        const hasJieba = await ensureJieba();
        if (hasJieba && jiebaModule?.tag) {
            try {
                const tagged = jiebaModule.tag(text, true);
                const candidates = [];
                const seen = new Set();
                for (const item of Array.isArray(tagged) ? tagged : []) {
                    let word = '';
                    let pos = '';
                    // Accept both [word, pos] tuples and object-shaped items.
                    if (Array.isArray(item)) {
                        [word, pos] = item;
                    } else if (item && typeof item === 'object') {
                        word = item.word || item.w || item.text || item.term || '';
                        pos = item.tag || item.pos || item.p || '';
                    }
                    // Skip malformed entries instead of letting one bad item throw and
                    // discard the whole jieba result.
                    if (typeof word !== 'string' || typeof pos !== 'string') continue;
                    if (!word || !pos) continue;
                    if (word.length < 2) continue;
                    if (!shouldKeepByPos(pos)) continue;
                    // STOP_WORDS stores its English entries in lowercase.
                    if (STOP_WORDS.has(word.toLowerCase())) continue;
                    if (seen.has(word)) continue;
                    seen.add(word);
                    candidates.push({ term: word, score: scoreTermRarity(word, pos) });
                }
                return pickTopRareTerms(candidates, maxCount);
            } catch (e) {
                xbLog.warn(MODULE_ID, 'jieba tag 失败:' + (e && e.message ? e.message : String(e)));
            }
        }
    }

    // Not Chinese / jieba failed → fallback
    const allNouns = await extractNouns(text, { minLen: 2, maxCount: 0 });
    const scored = allNouns.map((term) => ({ term, score: scoreTermRarity(term) }));
    return pickTopRareTerms(scored, maxCount);
}
/**
 * Extracts nouns from the `o` field of facts whose `s` field is a relevant subject.
 *
 * Facts flagged `retracted`, facts whose trimmed `s` is not in `relevantSubjects`, and
 * facts whose trimmed `o` is empty or longer than 30 characters are ignored. The
 * remaining `o` texts are joined with spaces and handed to extractNouns.
 *
 * @param {Array<{s?: *, o?: *, retracted?: *}>} facts - Facts to scan.
 * @param {Set<string>} relevantSubjects - Subjects whose facts should be considered.
 * @param {number} [maxCount=5] - Maximum number of nouns to return.
 * @returns {Promise<string[]>} Extracted nouns, or an empty array when nothing qualifies.
 */
export async function extractNounsFromFactsO(facts, relevantSubjects, maxCount = 5) {
    if (!facts?.length || !relevantSubjects?.size) {
        return [];
    }

    const objectTexts = [];
    for (const fact of facts) {
        // Only keep non-retracted facts about a relevant subject.
        const isRelevant = !fact.retracted
            && relevantSubjects.has(String(fact.s || '').trim());
        if (!isRelevant) continue;

        const objectText = String(fact.o || '').trim();
        // Skip empty values and overly long ones (possibly a complete sentence).
        if (objectText.length === 0 || objectText.length > 30) continue;

        objectTexts.push(objectText);
    }

    if (objectTexts.length === 0) {
        return [];
    }
    return extractNouns(objectTexts.join(' '), { minLen: 2, maxCount });
}
// Named export of the jieba loader, which is declared earlier in this file without the `export` keyword.
export { ensureJieba };