Files
LittleWhiteBox/modules/story-summary/vector/text-search.js

174 lines
5.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// text-search.js - final version
import MiniSearch from '../../../libs/minisearch.mjs';
// Function words and particles that the tokenizer drops (Chinese, Japanese, English).
// Single-character entries are written as spread strings, one element per character.
const STOP_WORDS = new Set([
    // Chinese function characters
    ...'的了是在和与或但而却',
    ...'这那他她它我你们着过',
    ...'把被给让向就都也还又',
    ...'很太更最只才已正会能',
    ...'要可得地之所以为于有',
    ...'不去来上下里说看吧呢',
    ...'啊吗呀哦嗯么',
    // Japanese particles and common endings
    ...'のにはをがとでへやか',
    ...'もなよねわ',
    'です', 'ます', 'した', 'ない',
    // English articles and forms of "be"
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
    // English auxiliaries
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
    // English prepositions
    'to', 'of', 'in', 'on', 'at', 'for', 'with', 'by', 'from',
    // English conjunctions and demonstratives
    'and', 'or', 'but', 'if', 'that', 'this', 'it', 'its',
    // English pronouns
    'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his',
]);
/**
 * Splits text into the search tokens used for both indexing and querying.
 *
 * The input is lower-cased and trimmed, then three kinds of tokens are collected
 * (duplicates are dropped, first occurrence wins):
 *  1. For every run of CJK ideographs: stop characters are removed, then all
 *     overlapping bigrams followed by all overlapping trigrams of what remains.
 *     A run that shrinks to fewer than two characters yields nothing.
 *  2. Every run of two or more kana characters that is not a stop word.
 *  3. Every run of two or more ASCII letters that is not a stop word.
 *
 * @param {*} text - Value to tokenize; falsy values are treated as an empty string.
 * @returns {string[]} Unique tokens in discovery order.
 */
function tokenize(text) {
    const normalized = String(text || '').toLowerCase().trim();
    if (normalized === '') return [];

    const seen = new Set();

    // CJK ideograph runs -> character n-grams (size 2 first, then size 3)
    const ideographRuns = normalized.match(/[\u4e00-\u9fff\u3400-\u4dbf]+/g) ?? [];
    for (const run of ideographRuns) {
        const kept = [...run].filter((ch) => !STOP_WORDS.has(ch));
        for (const size of [2, 3]) {
            for (let start = 0; start + size <= kept.length; start++) {
                seen.add(kept.slice(start, start + size).join(''));
            }
        }
    }

    // Whole-run tokens: kana runs first, then ASCII words
    const wholeRunPatterns = [
        /[\u3040-\u309f\u30a0-\u30ff]{2,}/g,
        /[a-z]{2,}/g,
    ];
    for (const pattern of wholeRunPatterns) {
        for (const word of normalized.match(pattern) ?? []) {
            if (!STOP_WORDS.has(word)) seen.add(word);
        }
    }

    return Array.from(seen);
}
// Module-level cache: the MiniSearch index built by ensureEventTextIndex, or null when no index exists.
let idx = null;
// Revision value recorded by the last successful index build; compared on later calls to skip rebuilds.
let lastRevision = null;
/**
 * Removes a trailing floor tag such as " (#12)" or " (#3-7)" from a string.
 *
 * Only a tag at the very end (optionally surrounded by whitespace) is removed;
 * the result is trimmed on both sides.
 *
 * @param {*} s - Value to clean; falsy values are treated as an empty string.
 * @returns {string} The text without the trailing tag.
 */
function stripFloorTag(s) {
    const text = String(s || '');
    const withoutTag = text.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '');
    return withoutTag.trim();
}
/**
 * Builds, reuses, or clears the module-level MiniSearch index over events.
 *
 * - An empty or missing event list clears the index and the stored revision.
 * - If an index exists and `revision` is strictly equal to the revision of the
 *   last successful build, nothing is done.
 * - Otherwise a new index over `title`, `summary` (trailing floor tag removed)
 *   and `participants` (joined with spaces) is built.
 *
 * Malformed records are tolerated so that one bad event cannot disable text
 * search as a whole: nullish entries and entries without an `id` are skipped,
 * a repeated `id` keeps its first occurrence (MiniSearch rejects documents with
 * a missing or duplicate id), and a non-array `participants` value is indexed
 * as-is when it is a string and ignored otherwise.
 *
 * On any build failure the error is logged and both the index and the stored
 * revision are reset, so the next call retries.
 *
 * @param {Array<object>} events - Events exposing `id`, `title`, `summary`, `participants`.
 * @param {*} revision - Opaque change marker, compared with `===`.
 * @returns {void}
 */
export function ensureEventTextIndex(events, revision) {
    if (!events?.length) {
        idx = null;
        lastRevision = null;
        return;
    }
    if (idx && revision === lastRevision) return;

    try {
        const seenIds = new Set();
        const docs = [];
        for (const event of events) {
            // Skip records MiniSearch would reject (no id / duplicate id).
            if (event?.id == null || seenIds.has(event.id)) continue;
            seenIds.add(event.id);

            const { participants } = event;
            let participantText = '';
            if (Array.isArray(participants)) {
                participantText = participants.join(' ');
            } else if (typeof participants === 'string') {
                participantText = participants;
            }

            docs.push({
                id: event.id,
                title: event.title || '',
                summary: stripFloorTag(event.summary),
                participants: participantText,
            });
        }

        const index = new MiniSearch({
            fields: ['title', 'summary', 'participants'],
            storeFields: ['id'],
            tokenize,
            searchOptions: { tokenize },
        });
        index.addAll(docs);

        // Publish only after the build fully succeeded.
        idx = index;
        lastRevision = revision;
    } catch (e) {
        console.error('[text-search] Index build failed:', e);
        idx = null;
        lastRevision = null;
    }
}
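/**
 * BM25 retrieval: returns the top-K candidates that are handed to RRF.
 * (Design notes for the text search implemented by searchEventsByText below.)
 *
 * Design principles:
 * - No score threshold (BM25 scores are not comparable across queries).
 * - No matched-term-count filter (bigram tokenization makes one word produce several tokens).
 * - Only a top-K cut (the BM25 ordering itself is discriminative).
 * - Quality filtering is left to the hasVector filter applied after RRF.
 */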
/**
 * Dynamic top-K using the cumulative score share.
 *
 * Returns the smallest K for which the first K scores add up to at least
 * `coverage` of the total, then keeps that K within [minK, maxK].
 * Rationale (original author's note): BM25 scores are assumed to be heavily
 * skewed, so a few top entries account for most of the total score (Pareto /
 * 80-20 style cut).
 *
 * The result never exceeds `scores.length` or `maxK`; when `minK` is larger
 * than either of those, the tighter bound wins.
 *
 * @param {number[]} scores - Scores in ranking order (best first).
 * @param {number} [coverage=0.90] - Fraction of the total score to cover.
 * @param {number} [minK=15] - Preferred minimum number of entries.
 * @param {number} [maxK=80] - Hard maximum number of entries.
 * @returns {number} How many leading entries to keep (0 for empty input).
 */
function dynamicTopK(scores, coverage = 0.90, minK = 15, maxK = 80) {
    const available = scores.length;
    if (!available) return 0;

    // Hard ceiling: cannot keep more than exist or more than the caller allows.
    const upper = Math.max(0, Math.min(maxK, available));
    // The floor must never push past the ceiling.
    const lower = Math.max(0, Math.min(minK, upper));

    const total = scores.reduce((a, b) => a + b, 0);
    if (total <= 0) return lower;

    let cumulative = 0;
    for (let i = 0; i < available; i++) {
        cumulative += scores[i];
        if (cumulative / total >= coverage) {
            return Math.max(lower, Math.min(upper, i + 1));
        }
    }
    // Coverage was never reached (e.g. coverage > 1): keep as many as allowed.
    return upper;
}
/**
 * Searches the event text index and returns the leading candidates in rank order.
 *
 * The number of results is chosen by dynamicTopK (90% cumulative score
 * coverage, preferred minimum 15) and is always capped by `limit` and by the
 * number of hits actually found. Search runs without fuzzy or prefix matching,
 * with field boosts title 4 / participants 2 / summary 1.
 *
 * The returned array carries an extra `_gapInfo` property with diagnostics:
 * total hit count, number returned, covered score share, and a few score
 * samples (top, cutoff, median position, last).
 *
 * @param {string} queryText - Query; non-string or blank values yield no results.
 * @param {number} [limit=80] - Maximum number of results. A non-finite value
 *   falls back to 80; negative values are treated as 0.
 * @returns {Array<{id: *, textRank: number, score: number}>} Ranked candidates
 *   (`textRank` is 1-based); an empty array when there is no index, no usable
 *   query, no hits, or the search throws (the error is logged).
 */
export function searchEventsByText(queryText, limit = 80) {
    // Guard the type here: calling .trim on a non-string would throw outside the try block.
    if (!idx || typeof queryText !== 'string' || !queryText.trim()) return [];

    const maxResults = Number.isFinite(limit) ? Math.max(0, Math.floor(limit)) : 80;

    try {
        const results = idx.search(queryText, {
            boost: { title: 4, participants: 2, summary: 1 },
            fuzzy: false,
            prefix: false,
        });
        if (!results.length) return [];

        const scores = results.map(r => r.score);
        // Clamp locally so the caller's limit and the real hit count always win
        // over the preferred minimum, and so `returned` below is accurate.
        const k = Math.min(
            dynamicTopK(scores, 0.90, 15, maxResults),
            maxResults,
            results.length,
        );

        const output = results.slice(0, k).map((r, i) => ({
            id: r.id,
            textRank: i + 1,
            score: r.score,
        }));

        const total = scores.reduce((a, b) => a + b, 0);
        const kCumulative = scores.slice(0, k).reduce((a, b) => a + b, 0);
        output._gapInfo = {
            total: results.length,
            returned: k,
            coverage: ((kCumulative / total) * 100).toFixed(1) + '%',
            scoreRange: {
                top: scores[0]?.toFixed(1),
                cutoff: scores[k - 1]?.toFixed(1),
                p50: scores[Math.floor(scores.length / 2)]?.toFixed(1),
                last: scores[scores.length - 1]?.toFixed(1),
            },
        };
        return output;
    } catch (e) {
        console.error('[text-search] Search failed:', e);
        return [];
    }
}
/**
 * Discards the cached text index and its recorded revision.
 * With no index present, searchEventsByText returns an empty list until
 * ensureEventTextIndex builds a new one.
 *
 * @returns {void}
 */
export function clearEventTextIndex() {
    lastRevision = null;
    idx = null;
}