// text-search.js - final version
import MiniSearch from '../../../libs/minisearch.mjs';

// Tokens that are dropped during tokenization: single CJK characters,
// Japanese particles/endings, and common English function words.
const STOP_WORDS = new Set([
  '的', '了', '是', '在', '和', '与', '或', '但', '而', '却',
  '这', '那', '他', '她', '它', '我', '你', '们', '着', '过',
  '把', '被', '给', '让', '向', '就', '都', '也', '还', '又',
  '很', '太', '更', '最', '只', '才', '已', '正', '会', '能',
  '要', '可', '得', '地', '之', '所', '以', '为', '于', '有',
  '不', '去', '来', '上', '下', '里', '说', '看', '吧', '呢',
  '啊', '吗', '呀', '哦', '嗯', '么',
  'の', 'に', 'は', 'を', 'が', 'と', 'で', 'へ', 'や', 'か',
  'も', 'な', 'よ', 'ね', 'わ', 'です', 'ます', 'した', 'ない',
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
  'to', 'of', 'in', 'on', 'at', 'for', 'with', 'by', 'from',
  'and', 'or', 'but', 'if', 'that', 'this', 'it', 'its',
  'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his',
]);

/**
 * Split text into a de-duplicated list of search tokens.
 *
 * The input is stringified, lower-cased and trimmed, then three kinds of
 * tokens are collected:
 * - CJK ideograph runs: stop characters are removed first, then every
 *   bigram and trigram of the remaining characters is emitted. Characters
 *   that become neighbours only after stop-character removal are still
 *   paired, and a run with fewer than two remaining characters yields
 *   nothing.
 * - Kana runs of two or more characters, kept whole unless the run itself
 *   is a stop word.
 * - Runs of two or more ASCII letters, unless the word is a stop word.
 *
 * @param {*} text - Value to tokenize; falsy values are treated as ''.
 * @returns {string[]} Unique tokens in first-seen order.
 */
function tokenize(text) {
  const s = String(text || '').toLowerCase().trim();
  if (!s) return [];

  const tokens = new Set();

  // CJK bigrams + trigrams
  const cjk = s.match(/[\u4e00-\u9fff\u3400-\u4dbf]+/g) || [];
  for (const seg of cjk) {
    const chars = [...seg].filter((c) => !STOP_WORDS.has(c));
    for (let i = 0; i < chars.length - 1; i++) {
      tokens.add(chars[i] + chars[i + 1]);
    }
    for (let i = 0; i < chars.length - 2; i++) {
      tokens.add(chars[i] + chars[i + 1] + chars[i + 2]);
    }
  }

  // Japanese kana
  const kana = s.match(/[\u3040-\u309f\u30a0-\u30ff]{2,}/g) || [];
  for (const k of kana) {
    if (!STOP_WORDS.has(k)) tokens.add(k);
  }

  // English
  const en = s.match(/[a-z]{2,}/g) || [];
  for (const w of en) {
    if (!STOP_WORDS.has(w)) tokens.add(w);
  }

  return [...tokens];
}

// Module-level cache: the current MiniSearch index and the revision it was
// built from. Both are null when no usable index exists.
let idx = null;
let lastRevision = null;

/**
 * Remove a trailing tag of the form "(#12)" or "(#12-15)", together with the
 * whitespace around it, and trim the result.
 *
 * @param {*} s - Value to clean; falsy values are treated as ''.
 * @returns {string} The text without the trailing tag.
 */
function stripFloorTag(s) {
  return String(s || '').replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '').trim();
}

/**
 * Build the event text index, or keep the cached one when it is still valid.
 *
 * - Empty or missing `events` clears the index.
 * - If an index exists and `revision` strictly equals the revision it was
 *   built from, nothing is rebuilt.
 * - Otherwise a new index over `title`, `summary` (trailing tag stripped)
 *   and `participants` (joined with spaces) is built. A build failure is
 *   logged and leaves the module without an index.
 *
 * @param {Array<{id: *, title?: string, summary?: string, participants?: string[]}>} events
 * @param {*} revision - Opaque cache key compared with `===`.
 * @returns {void}
 */
export function ensureEventTextIndex(events, revision) {
  if (!events?.length) {
    idx = null;
    lastRevision = null;
    return;
  }

  if (idx && revision === lastRevision) return;

  try {
    // Build into a local so the module state is only updated on success.
    const next = new MiniSearch({
      fields: ['title', 'summary', 'participants'],
      storeFields: ['id'],
      tokenize,
      searchOptions: { tokenize },
    });

    next.addAll(events.map((e) => ({
      id: e.id,
      title: e.title || '',
      summary: stripFloorTag(e.summary),
      participants: (e.participants || []).join(' '),
    })));

    idx = next;
    lastRevision = revision;
  } catch (e) {
    console.error('[text-search] Index build failed:', e);
    idx = null;
    // Keep the pair consistent: no index means no revision either.
    lastRevision = null;
  }
}

/**
 * Dynamic top-K by cumulative score share.
 *
 * Returns the smallest K whose leading scores sum to at least `coverage` of
 * the total score, clamped into [minK, maxK]. The original design note
 * motivates this with the assumption that BM25 scores are heavy-tailed
 * (a few high-scoring entries carry most of the total, Pareto / 80-20 style).
 *
 * The result never exceeds `scores.length` or `maxK`; when `maxK` is smaller
 * than `minK`, `maxK` wins, so a caller-supplied limit is always honoured.
 *
 * NOTE(review): the cumulative cut only makes sense if `scores` is sorted in
 * descending order; the only caller passes MiniSearch results as returned.
 *
 * @param {number[]} scores - Result scores, best first.
 * @param {number} [coverage=0.90] - Target share of the total score (0..1).
 * @param {number} [minK=15] - Preferred lower bound for K.
 * @param {number} [maxK=80] - Hard upper bound for K.
 * @returns {number} Number of leading results to keep (0 for empty input).
 */
function dynamicTopK(scores, coverage = 0.90, minK = 15, maxK = 80) {
  const count = scores.length;
  if (!count) return 0;

  // Hard cap: never more than we have, never more than the caller allows.
  const upper = Math.max(0, Math.min(maxK, count));
  // Soft floor: cannot exceed the cap.
  const lower = Math.max(0, Math.min(minK, upper));

  const total = scores.reduce((a, b) => a + b, 0);
  if (total <= 0) return lower;

  let cumulative = 0;
  for (let i = 0; i < count; i++) {
    cumulative += scores[i];
    if (cumulative / total >= coverage) {
      return Math.max(lower, Math.min(upper, i + 1));
    }
  }

  return upper;
}

/**
 * BM25 search over the event index; returns top-K candidates for RRF fusion.
 *
 * Design principles (from the original author's notes):
 * - No score threshold: BM25 scores are not comparable across queries.
 * - No match-count threshold: bigrams turn one word into several tokens.
 * - Only a top-K cut: BM25 ranking itself is discriminative enough.
 * - Quality filtering is left to the hasVector filter applied after RRF.
 *
 * The returned array additionally carries a non-index `_gapInfo` property
 * with diagnostics (total hits, returned count, score coverage of the kept
 * results, and a few score sample points formatted with one decimal).
 *
 * @param {string} queryText - Query; blank input yields an empty result.
 * @param {number} [limit=80] - Maximum number of results to return.
 * @returns {Array<{id: *, textRank: number, score: number}>} Ranked hits
 *   (textRank is 1-based); empty when there is no index, no query, no hit,
 *   or the search throws (the error is logged).
 */
export function searchEventsByText(queryText, limit = 80) {
  if (!idx || !queryText?.trim()) return [];

  try {
    const results = idx.search(queryText, {
      boost: { title: 4, participants: 2, summary: 1 },
      fuzzy: false,
      prefix: false,
    });

    if (!results.length) return [];

    const scores = results.map((r) => r.score);
    const k = dynamicTopK(scores, 0.90, 15, limit);

    const output = results.slice(0, k).map((r, i) => ({
      id: r.id,
      textRank: i + 1,
      score: r.score,
    }));

    const returned = output.length;
    const total = scores.reduce((a, b) => a + b, 0);
    const kCumulative = scores.slice(0, returned).reduce((a, b) => a + b, 0);
    // Avoid 'NaN%' when every score is zero.
    const coveragePct = total > 0 ? (kCumulative / total) * 100 : 0;

    output._gapInfo = {
      total: results.length,
      returned,
      coverage: coveragePct.toFixed(1) + '%',
      scoreRange: {
        top: scores[0]?.toFixed(1),
        cutoff: scores[returned - 1]?.toFixed(1),
        p50: scores[Math.floor(scores.length / 2)]?.toFixed(1),
        last: scores[scores.length - 1]?.toFixed(1),
      },
    };

    return output;
  } catch (e) {
    console.error('[text-search] Search failed:', e);
    return [];
  }
}

/**
 * Drop the cached index and its revision so the next
 * `ensureEventTextIndex` call rebuilds from scratch.
 *
 * @returns {void}
 */
export function clearEventTextIndex() {
  idx = null;
  lastRevision = null;
}