Update recall entity weighting and prompt sections
This commit is contained in:
@@ -91,9 +91,9 @@ function cleanSummary(summary) {
|
|||||||
|
|
||||||
function buildSystemPreamble() {
|
function buildSystemPreamble() {
|
||||||
return [
|
return [
|
||||||
"以上内容为因上下文窗口限制保留的可见历史",
|
"以上是还留在眼前的对话",
|
||||||
"以下【剧情记忆】是对可见与不可见历史的总结:",
|
"以下是脑海里的记忆:",
|
||||||
"• 【世界约束】记录着已确立的事实",
|
"• [定了的事] 这些是不会变的",
|
||||||
"• 其余部分是过往经历的回忆碎片",
|
"• 其余部分是过往经历的回忆碎片",
|
||||||
"",
|
"",
|
||||||
"请内化这些记忆:",
|
"请内化这些记忆:",
|
||||||
@@ -103,7 +103,7 @@ function buildSystemPreamble() {
|
|||||||
/**
 * Build the postscript text.
 *
 * @returns {string} An empty first line followed by the closing sentence,
 *   joined with a newline.
 */
function buildPostscript() {
    const closingLines = ["", "这些记忆是真实的,请自然地记住它们。"];
    return closingLines.join("\n");
}
|
||||||
|
|
||||||
@@ -594,49 +594,36 @@ async function buildVectorPrompt(store, recallResult, causalById, queryEntities
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
// 按注入顺序拼接 sections
|
// ═══════════════════════════════════════════════════════════════════════
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
// 按注入顺序拼接 sections
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════
|
||||||
|
const sections = [];
|
||||||
|
// 1. 世界约束 → 定了的事
|
||||||
|
if (assembled.world.lines.length) {
|
||||||
|
sections.push(`[定了的事] 已确立的事实\n${assembled.world.lines.join("\n")}`);
|
||||||
|
}
|
||||||
|
// 2. 核心经历 → 印象深的事
|
||||||
|
if (assembled.events.direct.length) {
|
||||||
|
sections.push(`[印象深的事] 记得很清楚\n\n${assembled.events.direct.join("\n\n")}`);
|
||||||
|
}
|
||||||
|
// 3. 过往背景 → 好像有关的事
|
||||||
|
if (assembled.events.similar.length) {
|
||||||
|
sections.push(`[好像有关的事] 听说过或有点模糊\n\n${assembled.events.similar.join("\n\n")}`);
|
||||||
|
}
|
||||||
|
// 4. 远期片段 → 更早以前
|
||||||
|
if (assembled.orphans.lines.length) {
|
||||||
|
sections.push(`[更早以前] 记忆里残留的老画面\n${assembled.orphans.lines.join("\n")}`);
|
||||||
|
}
|
||||||
|
// 5. 待整理 → 刚发生的
|
||||||
|
if (assembled.recentOrphans.lines.length) {
|
||||||
|
sections.push(`[刚发生的] 还没来得及想明白\n${assembled.recentOrphans.lines.join("\n")}`);
|
||||||
|
}
|
||||||
|
// 6. 人物弧光 → 这些人
|
||||||
|
if (assembled.arcs.lines.length) {
|
||||||
|
sections.push(`[这些人] 他们现在怎样了\n${assembled.arcs.lines.join("\n")}`);
|
||||||
|
}
|
||||||
|
|
||||||
const sections = [];
|
if (!sections.length) {
|
||||||
|
|
||||||
// 1. 世界约束
|
|
||||||
if (assembled.world.lines.length) {
|
|
||||||
sections.push(`[世界约束] 已确立的事实\n${assembled.world.lines.join("\n")}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. 核心经历
|
|
||||||
if (assembled.events.direct.length) {
|
|
||||||
sections.push(`[核心经历] 深刻的记忆\n\n${assembled.events.direct.join("\n\n")}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. 过往背景
|
|
||||||
if (assembled.events.similar.length) {
|
|
||||||
sections.push(`[过往背景] 听别人说起或比较模糊的往事\n\n${assembled.events.similar.join("\n\n")}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4. 远期片段
|
|
||||||
if (assembled.orphans.lines.length) {
|
|
||||||
sections.push(`[远期片段] 记忆里残留的一些老画面\n${assembled.orphans.lines.join("\n")}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 5. 待整理
|
|
||||||
if (assembled.recentOrphans.lines.length) {
|
|
||||||
sections.push(`[待整理] 最近发生但尚未梳理的原始记忆\n${assembled.recentOrphans.lines.join("\n")}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 6. 人物弧光(最后注入,但预算已在优先级 2 预留)
|
|
||||||
if (assembled.arcs.lines.length) {
|
|
||||||
sections.push(`[人物弧光]\n${assembled.arcs.lines.join("\n")}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
|
||||||
// 统计 & 返回
|
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
// 总预算 = 主装配 + 待整理
|
|
||||||
injectionStats.budget.used = total.used + (recentOrphanStats.tokens || 0);
|
|
||||||
|
|
||||||
if (!sections.length) {
|
|
||||||
return { promptText: "", injectionLogText: "", injectionStats };
|
return { promptText: "", injectionLogText: "", injectionStats };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -296,19 +296,34 @@ function buildEntityLexicon(store, allEvents) {
|
|||||||
.slice(0, 5000);
|
.slice(0, 5000);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Extract lexicon entities from segmented messages; each entity inherits the
 * weight of the segment it appears in. When an entity occurs in several
 * segments, the highest weight wins.
 *
 * Every matched entity is recorded, including matches in segments whose
 * weight is missing or 0 (stored with weight 0), so the key set always
 * reflects everything that was mentioned.
 *
 * @param {string[]} segments - Message segments to scan.
 * @param {number[]} weights - Weight per segment, index-aligned with `segments`;
 *   missing or falsy entries count as 0.
 * @param {string[]} lexicon - Entity names to look for.
 *   NOTE(review): presumably already normalized the same way as the segments — confirm.
 * @returns {Map<string, number>} Entity -> highest weight among the segments containing it.
 */
function extractEntitiesWithWeights(segments, weights, lexicon) {
    const entityWeights = new Map();

    if (!segments?.length || !lexicon?.length) return entityWeights;

    for (let i = 0; i < segments.length; i++) {
        const text = normalize(segments[i]);
        // A segment that normalizes to nothing cannot contain an entity.
        if (!text) continue;

        const weight = weights?.[i] || 0;

        for (const entity of lexicon) {
            if (!text.includes(entity)) continue;

            const known = entityWeights.get(entity);
            // Record first sightings unconditionally: comparing against a
            // default of 0 would silently drop zero-weight matches.
            if (known === undefined || weight > known) {
                entityWeights.set(entity, weight);
            }
        }
    }

    return entityWeights;
}
|
||||||
// ═══════════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
// MMR
|
// MMR
|
||||||
// ═══════════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
@@ -457,7 +472,7 @@ async function searchChunks(queryVector, vectorConfig, l0FloorBonus = new Map(),
|
|||||||
// L2 Events 检索(RRF 混合 + MMR 后置)
|
// L2 Events 检索(RRF 混合 + MMR 后置)
|
||||||
// ═══════════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorConfig, store, queryEntities, l0FloorBonus = new Map()) {
|
async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorConfig, store, queryEntityWeights, l0FloorBonus = new Map()) {
|
||||||
const { chatId } = getContext();
|
const { chatId } = getContext();
|
||||||
if (!chatId || !queryVector?.length) return [];
|
if (!chatId || !queryVector?.length) return [];
|
||||||
|
|
||||||
@@ -475,11 +490,14 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
|
|
||||||
// 文本路检索
|
// 文本路检索
|
||||||
const textRanked = searchEventsByText(queryTextForSearch, CONFIG.TEXT_SEARCH_LIMIT);
|
const textRanked = searchEventsByText(queryTextForSearch, CONFIG.TEXT_SEARCH_LIMIT);
|
||||||
|
const textGapInfo = textRanked._gapInfo || null;
|
||||||
|
|
||||||
// ═══════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════
|
||||||
// 向量路检索(只保留 L0 加权)
|
// 向量路检索(只保留 L0 加权)
|
||||||
// ═══════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
const ENTITY_BONUS_FACTOR = 0.10;
|
||||||
|
|
||||||
const scored = (allEvents || []).map((event, idx) => {
|
const scored = (allEvents || []).map((event, idx) => {
|
||||||
const v = vectorMap.get(event.id);
|
const v = vectorMap.get(event.id);
|
||||||
const sim = v ? cosineSimilarity(queryVector, v) : 0;
|
const sim = v ? cosineSimilarity(queryVector, v) : 0;
|
||||||
@@ -497,6 +515,17 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const participants = (event.participants || []).map(p => normalize(p));
|
||||||
|
let maxEntityWeight = 0;
|
||||||
|
for (const p of participants) {
|
||||||
|
const w = queryEntityWeights.get(p) || 0;
|
||||||
|
if (w > maxEntityWeight) {
|
||||||
|
maxEntityWeight = w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const entityBonus = ENTITY_BONUS_FACTOR * maxEntityWeight;
|
||||||
|
bonus += entityBonus;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
_id: event.id,
|
_id: event.id,
|
||||||
_idx: idx,
|
_idx: idx,
|
||||||
@@ -504,9 +533,12 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
similarity: sim,
|
similarity: sim,
|
||||||
finalScore: sim + bonus,
|
finalScore: sim + bonus,
|
||||||
vector: v,
|
vector: v,
|
||||||
|
_entityBonus: entityBonus,
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const entityBonusById = new Map(scored.map(s => [s._id, s._entityBonus]));
|
||||||
|
|
||||||
const preFilterDistribution = {
|
const preFilterDistribution = {
|
||||||
total: scored.length,
|
total: scored.length,
|
||||||
'0.85+': scored.filter(s => s.finalScore >= 0.85).length,
|
'0.85+': scored.filter(s => s.finalScore >= 0.85).length,
|
||||||
@@ -518,7 +550,6 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
threshold: CONFIG.MIN_SIMILARITY_EVENT,
|
threshold: CONFIG.MIN_SIMILARITY_EVENT,
|
||||||
};
|
};
|
||||||
|
|
||||||
// 向量路:纯相似度排序(不在这里做 MMR)
|
|
||||||
const candidates = scored
|
const candidates = scored
|
||||||
.filter(s => s.finalScore >= CONFIG.MIN_SIMILARITY_EVENT)
|
.filter(s => s.finalScore >= CONFIG.MIN_SIMILARITY_EVENT)
|
||||||
.sort((a, b) => b.finalScore - a.finalScore)
|
.sort((a, b) => b.finalScore - a.finalScore)
|
||||||
@@ -530,15 +561,12 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
vector: s.vector,
|
vector: s.vector,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// RRF 融合
|
|
||||||
const eventById = new Map(allEvents.map(e => [e.id, e]));
|
const eventById = new Map(allEvents.map(e => [e.id, e]));
|
||||||
const fused = fuseEventsByRRF(vectorRanked, textRanked, eventById);
|
const fused = fuseEventsByRRF(vectorRanked, textRanked, eventById);
|
||||||
|
|
||||||
// 向量非空时过滤纯 TEXT
|
|
||||||
const hasVector = vectorRanked.length > 0;
|
const hasVector = vectorRanked.length > 0;
|
||||||
const filtered = hasVector ? fused.filter(x => x.type !== 'TEXT') : fused;
|
const filtered = hasVector ? fused.filter(x => x.type !== 'TEXT') : fused;
|
||||||
|
|
||||||
// MMR 放在融合后:对最终候选集去重
|
|
||||||
const mmrInput = filtered.slice(0, CONFIG.CANDIDATE_EVENTS).map(x => ({
|
const mmrInput = filtered.slice(0, CONFIG.CANDIDATE_EVENTS).map(x => ({
|
||||||
...x,
|
...x,
|
||||||
_id: x.id,
|
_id: x.id,
|
||||||
@@ -551,7 +579,6 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
c => c.vector || null,
|
c => c.vector || null,
|
||||||
c => c.rrf
|
c => c.rrf
|
||||||
);
|
);
|
||||||
|
|
||||||
// 构造结果
|
// 构造结果
|
||||||
const results = mmrOutput.map(x => ({
|
const results = mmrOutput.map(x => ({
|
||||||
event: x.event,
|
event: x.event,
|
||||||
@@ -559,6 +586,7 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
_recallType: x.type === 'HYBRID' ? 'DIRECT' : 'SIMILAR',
|
_recallType: x.type === 'HYBRID' ? 'DIRECT' : 'SIMILAR',
|
||||||
_recallReason: x.type,
|
_recallReason: x.type,
|
||||||
_rrfDetail: { vRank: x.vRank, tRank: x.tRank, rrf: x.rrf },
|
_rrfDetail: { vRank: x.vRank, tRank: x.tRank, rrf: x.rrf },
|
||||||
|
_entityBonus: entityBonusById.get(x.event?.id) || 0,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// 统计信息附加到第一条结果
|
// 统计信息附加到第一条结果
|
||||||
@@ -571,6 +599,7 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo
|
|||||||
vectorOnlyCount: fused.filter(x => x.type === 'VECTOR').length,
|
vectorOnlyCount: fused.filter(x => x.type === 'VECTOR').length,
|
||||||
textOnlyFiltered: fused.filter(x => x.type === 'TEXT').length,
|
textOnlyFiltered: fused.filter(x => x.type === 'TEXT').length,
|
||||||
};
|
};
|
||||||
|
results[0]._textGapInfo = textGapInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
@@ -587,10 +616,11 @@ function formatRecallLog({
|
|||||||
chunkResults,
|
chunkResults,
|
||||||
eventResults,
|
eventResults,
|
||||||
allEvents,
|
allEvents,
|
||||||
queryEntities,
|
queryEntityWeights = new Map(),
|
||||||
causalEvents = [],
|
causalEvents = [],
|
||||||
chunkPreFilterStats = null,
|
chunkPreFilterStats = null,
|
||||||
l0Results = [],
|
l0Results = [],
|
||||||
|
textGapInfo = null,
|
||||||
}) {
|
}) {
|
||||||
const lines = [
|
const lines = [
|
||||||
'\u2554' + '\u2550'.repeat(62) + '\u2557',
|
'\u2554' + '\u2550'.repeat(62) + '\u2557',
|
||||||
@@ -621,7 +651,18 @@ function formatRecallLog({
|
|||||||
lines.push('\u250c' + '\u2500'.repeat(61) + '\u2510');
|
lines.push('\u250c' + '\u2500'.repeat(61) + '\u2510');
|
||||||
lines.push('\u2502 【提取实体】 \u2502');
|
lines.push('\u2502 【提取实体】 \u2502');
|
||||||
lines.push('\u2514' + '\u2500'.repeat(61) + '\u2518');
|
lines.push('\u2514' + '\u2500'.repeat(61) + '\u2518');
|
||||||
lines.push(` ${queryEntities?.length ? queryEntities.join('、') : '(无)'}`);
|
|
||||||
|
if (queryEntityWeights?.size) {
|
||||||
|
const sorted = Array.from(queryEntityWeights.entries())
|
||||||
|
.sort((a, b) => b[1] - a[1])
|
||||||
|
.slice(0, 8);
|
||||||
|
const formatted = sorted
|
||||||
|
.map(([e, w]) => `${e}(${(w * 100).toFixed(0)}%)`)
|
||||||
|
.join(' | ');
|
||||||
|
lines.push(` ${formatted}`);
|
||||||
|
} else {
|
||||||
|
lines.push(' (无)');
|
||||||
|
}
|
||||||
|
|
||||||
lines.push('');
|
lines.push('');
|
||||||
lines.push('\u250c' + '\u2500'.repeat(61) + '\u2510');
|
lines.push('\u250c' + '\u2500'.repeat(61) + '\u2510');
|
||||||
@@ -642,7 +683,7 @@ function formatRecallLog({
|
|||||||
lines.push(' L1 原文片段:');
|
lines.push(' L1 原文片段:');
|
||||||
if (chunkPreFilterStats) {
|
if (chunkPreFilterStats) {
|
||||||
const dist = chunkPreFilterStats.distribution || {};
|
const dist = chunkPreFilterStats.distribution || {};
|
||||||
lines.push(` \u5168\u91cf: ${chunkPreFilterStats.total} \u6761 | \u901a\u8fc7\u9608\u503c(\u8fdc\u671f\u2265${chunkPreFilterStats.thresholdRemote}, \u5f85\u6574\u7406\u2265${chunkPreFilterStats.thresholdRecent}): ${chunkPreFilterStats.passThreshold} \u6761 | \u6700\u7ec8: ${chunkResults.length} \u6761`);
|
lines.push(` 全量: ${chunkPreFilterStats.total} 条 | 通过阈值(远期≥${chunkPreFilterStats.thresholdRemote}, 待整理≥${chunkPreFilterStats.thresholdRecent}): ${chunkPreFilterStats.passThreshold} 条 | 最终: ${chunkResults.length} 条`);
|
||||||
lines.push(` 匹配度: 0.8+: ${dist['0.8+'] || 0} | 0.7-0.8: ${dist['0.7-0.8'] || 0} | 0.6-0.7: ${dist['0.6-0.7'] || 0}`);
|
lines.push(` 匹配度: 0.8+: ${dist['0.8+'] || 0} | 0.7-0.8: ${dist['0.7-0.8'] || 0} | 0.6-0.7: ${dist['0.6-0.7'] || 0}`);
|
||||||
} else {
|
} else {
|
||||||
lines.push(` 选入: ${chunkResults.length} 条`);
|
lines.push(` 选入: ${chunkResults.length} 条`);
|
||||||
@@ -656,6 +697,18 @@ function formatRecallLog({
|
|||||||
lines.push(` 总事件: ${allEvents.length} 条 | 最终: ${eventResults.length} 条`);
|
lines.push(` 总事件: ${allEvents.length} 条 | 最终: ${eventResults.length} 条`);
|
||||||
lines.push(` 向量路: ${rrfStats.vectorCount || 0} 条 | 文本路: ${rrfStats.textCount || 0} 条`);
|
lines.push(` 向量路: ${rrfStats.vectorCount || 0} 条 | 文本路: ${rrfStats.textCount || 0} 条`);
|
||||||
lines.push(` HYBRID: ${rrfStats.hybridCount || 0} 条 | 纯 VECTOR: ${rrfStats.vectorOnlyCount || 0} 条 | 纯 TEXT (已过滤): ${rrfStats.textOnlyFiltered || 0} 条`);
|
lines.push(` HYBRID: ${rrfStats.hybridCount || 0} 条 | 纯 VECTOR: ${rrfStats.vectorOnlyCount || 0} 条 | 纯 TEXT (已过滤): ${rrfStats.textOnlyFiltered || 0} 条`);
|
||||||
|
const entityBoostedEvents = eventResults.filter(e => e._entityBonus > 0).length;
|
||||||
|
lines.push(` 实体加分事件: ${entityBoostedEvents} 条`);
|
||||||
|
|
||||||
|
if (textGapInfo) {
|
||||||
|
lines.push('');
|
||||||
|
lines.push(' 文本检索 (BM25 动态 top-K):');
|
||||||
|
lines.push(` 命中: ${textGapInfo.total} 条 | 返回: ${textGapInfo.returned} 条 (覆盖 ${textGapInfo.coverage} 总分)`);
|
||||||
|
if (textGapInfo.scoreRange) {
|
||||||
|
const s = textGapInfo.scoreRange;
|
||||||
|
lines.push(` 分数: Top=${s.top} | 截断=${s.cutoff} | P50=${s.p50} | Last=${s.last}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Causal
|
// Causal
|
||||||
if (causalEvents.length) {
|
if (causalEvents.length) {
|
||||||
@@ -702,7 +755,8 @@ export async function recallMemory(queryText, allEvents, vectorConfig, options =
|
|||||||
}
|
}
|
||||||
|
|
||||||
const lexicon = buildEntityLexicon(store, allEvents);
|
const lexicon = buildEntityLexicon(store, allEvents);
|
||||||
const queryEntities = extractEntities(segments.join('\n'), lexicon);
|
const queryEntityWeights = extractEntitiesWithWeights(segments, weights, lexicon);
|
||||||
|
const queryEntities = Array.from(queryEntityWeights.keys());
|
||||||
|
|
||||||
// 构建文本查询串:最后一条消息 + 实体 + 关键词
|
// 构建文本查询串:最后一条消息 + 实体 + 关键词
|
||||||
const lastSeg = segments[segments.length - 1] || '';
|
const lastSeg = segments[segments.length - 1] || '';
|
||||||
@@ -727,10 +781,11 @@ export async function recallMemory(queryText, allEvents, vectorConfig, options =
|
|||||||
|
|
||||||
const [chunkResults, eventResults] = await Promise.all([
|
const [chunkResults, eventResults] = await Promise.all([
|
||||||
searchChunks(queryVector, vectorConfig, l0FloorBonus, lastSummarizedFloor),
|
searchChunks(queryVector, vectorConfig, l0FloorBonus, lastSummarizedFloor),
|
||||||
searchEvents(queryVector, queryTextForSearch, allEvents, vectorConfig, store, queryEntities, l0FloorBonus),
|
searchEvents(queryVector, queryTextForSearch, allEvents, vectorConfig, store, queryEntityWeights, l0FloorBonus),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const chunkPreFilterStats = chunkResults._preFilterStats || null;
|
const chunkPreFilterStats = chunkResults._preFilterStats || null;
|
||||||
|
const textGapInfo = eventResults[0]?._textGapInfo || null;
|
||||||
|
|
||||||
const mergedChunks = mergeAndSparsify(l0VirtualChunks, chunkResults, CONFIG.FLOOR_MAX_CHUNKS);
|
const mergedChunks = mergeAndSparsify(l0VirtualChunks, chunkResults, CONFIG.FLOOR_MAX_CHUNKS);
|
||||||
|
|
||||||
@@ -764,10 +819,11 @@ export async function recallMemory(queryText, allEvents, vectorConfig, options =
|
|||||||
chunkResults: mergedChunks,
|
chunkResults: mergedChunks,
|
||||||
eventResults,
|
eventResults,
|
||||||
allEvents,
|
allEvents,
|
||||||
queryEntities,
|
queryEntityWeights,
|
||||||
causalEvents: causalEventsTruncated,
|
causalEvents: causalEventsTruncated,
|
||||||
chunkPreFilterStats,
|
chunkPreFilterStats,
|
||||||
l0Results,
|
l0Results,
|
||||||
|
textGapInfo,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.group('%c[Recall]', 'color: #7c3aed; font-weight: bold');
|
console.group('%c[Recall]', 'color: #7c3aed; font-weight: bold');
|
||||||
|
|||||||
@@ -1,37 +1,70 @@
|
|||||||
// ═══════════════════════════════════════════════════════════════════════════
|
// text-search.js - 最终版
|
||||||
// Text Search - L2 事件文本检索(MiniSearch)
|
|
||||||
// 与向量检索互补,通过 RRF 融合
|
|
||||||
// ═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
import MiniSearch from '../../../libs/minisearch.mjs';
|
import MiniSearch from '../../../libs/minisearch.mjs';
|
||||||
|
|
||||||
|
// Function words that are excluded from tokens (Chinese, Japanese, English).
const STOP_WORDS = new Set([
    '的', '了', '是', '在', '和', '与', '或', '但', '而', '却',
    '这', '那', '他', '她', '它', '我', '你', '们', '着', '过',
    '把', '被', '给', '让', '向', '就', '都', '也', '还', '又',
    '很', '太', '更', '最', '只', '才', '已', '正', '会', '能',
    '要', '可', '得', '地', '之', '所', '以', '为', '于', '有',
    '不', '去', '来', '上', '下', '里', '说', '看', '吧', '呢',
    '啊', '吗', '呀', '哦', '嗯', '么',
    'の', 'に', 'は', 'を', 'が', 'と', 'で', 'へ', 'や', 'か',
    'も', 'な', 'よ', 'ね', 'わ', 'です', 'ます', 'した', 'ない',
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
    'to', 'of', 'in', 'on', 'at', 'for', 'with', 'by', 'from',
    'and', 'or', 'but', 'if', 'that', 'this', 'it', 'its',
    'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his',
]);

/**
 * Split text into a de-duplicated list of search tokens.
 *
 * - CJK ideograph runs: stop characters are removed, then every bigram and
 *   trigram of the remaining characters is emitted. A run that is left with a
 *   single character emits that character, so one-character names stay
 *   searchable.
 * - Kana runs of two or more characters are emitted whole unless the run is a
 *   stop word.
 * - Lower-cased ASCII letter/digit runs of two or more characters are emitted
 *   unless they are stop words (digits are kept so "404" or "r2d2" survive).
 *
 * @param {*} text - Input; falsy values are treated as the empty string.
 * @returns {string[]} Unique tokens in first-seen order.
 */
function tokenize(text) {
    const s = String(text || '').toLowerCase().trim();
    if (!s) return [];

    const tokens = new Set();

    // CJK: bigrams + trigrams over the run with stop characters removed.
    const cjkRuns = s.match(/[\u4e00-\u9fff\u3400-\u4dbf]+/g) || [];
    for (const run of cjkRuns) {
        const chars = [...run].filter((c) => !STOP_WORDS.has(c));

        if (chars.length === 1) {
            // No n-gram can be formed; keep the lone character instead of
            // dropping the run entirely.
            tokens.add(chars[0]);
            continue;
        }

        for (let i = 0; i < chars.length - 1; i++) {
            tokens.add(chars[i] + chars[i + 1]);
        }
        for (let i = 0; i < chars.length - 2; i++) {
            tokens.add(chars[i] + chars[i + 1] + chars[i + 2]);
        }
    }

    // Japanese kana runs.
    const kanaRuns = s.match(/[\u3040-\u309f\u30a0-\u30ff]{2,}/g) || [];
    for (const run of kanaRuns) {
        if (!STOP_WORDS.has(run)) tokens.add(run);
    }

    // ASCII words, including digits and mixed alphanumerics.
    const words = s.match(/[a-z0-9]{2,}/g) || [];
    for (const word of words) {
        if (!STOP_WORDS.has(word)) tokens.add(word);
    }

    return [...tokens];
}
|
||||||
|
|
||||||
let idx = null;
|
let idx = null;
|
||||||
let lastRevision = null;
|
let lastRevision = null;
|
||||||
|
|
||||||
/**
|
|
||||||
* 中文逐字 + 英数字串分词
|
|
||||||
*/
|
|
||||||
function tokenize(text) {
|
|
||||||
return String(text || '').match(/[\u4e00-\u9fff]|[a-zA-Z0-9]+/g) || [];
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 去掉 summary 末尾的楼层标记
|
|
||||||
*/
|
|
||||||
/**
 * Remove a trailing floor marker such as "(#12)" or "(#12-15)" from a string,
 * then trim surrounding whitespace.
 *
 * @param {*} s - Input; falsy values are treated as the empty string.
 * @returns {string} The text without the trailing marker.
 */
function stripFloorTag(s) {
    const text = String(s || '');
    const trailingFloorTag = /\s*\(#\d+(?:-\d+)?\)\s*$/;
    return text.replace(trailingFloorTag, '').trim();
}
|
||||||
|
|
||||||
/**
|
|
||||||
* 构建/更新事件文本索引
|
|
||||||
*/
|
|
||||||
export function ensureEventTextIndex(events, revision) {
|
export function ensureEventTextIndex(events, revision) {
|
||||||
if (!events?.length) {
|
if (!events?.length) {
|
||||||
idx = null;
|
idx = null;
|
||||||
lastRevision = null;
|
lastRevision = null;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (idx && revision === lastRevision) return;
|
if (idx && revision === lastRevision) return;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -39,6 +72,7 @@ export function ensureEventTextIndex(events, revision) {
|
|||||||
fields: ['title', 'summary', 'participants'],
|
fields: ['title', 'summary', 'participants'],
|
||||||
storeFields: ['id'],
|
storeFields: ['id'],
|
||||||
tokenize,
|
tokenize,
|
||||||
|
searchOptions: { tokenize },
|
||||||
});
|
});
|
||||||
|
|
||||||
idx.addAll(events.map(e => ({
|
idx.addAll(events.map(e => ({
|
||||||
@@ -52,33 +86,87 @@ export function ensureEventTextIndex(events, revision) {
|
|||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error('[text-search] Index build failed:', e);
|
console.error('[text-search] Index build failed:', e);
|
||||||
idx = null;
|
idx = null;
|
||||||
lastRevision = null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 文本检索事件
|
* BM25 检索,返回 top-K 候选给 RRF
|
||||||
|
*
|
||||||
|
* 设计原则:
|
||||||
|
* - 不做分数过滤(BM25 分数跨查询不可比)
|
||||||
|
* - 不做匹配数过滤(bigram 让一个词产生多个 token)
|
||||||
|
* - 只做 top-K(BM25 排序本身有区分度)
|
||||||
|
* - 质量过滤交给 RRF 后的 hasVector 过滤
|
||||||
*/
|
*/
|
||||||
|
/**
 * Dynamic top-K by cumulative score share.
 *
 * Walks the scores in the given order and returns the smallest K whose
 * cumulative sum reaches `coverage` of the total, then clamps it. The working
 * assumption (Pareto / 80-20 style) is that a few high-scoring entries carry
 * most of the total score.
 *
 * Clamping rules:
 * - the result never exceeds `scores.length` (K counts existing items);
 * - `maxK` is a hard cap and wins over `minK` when the two conflict;
 * - otherwise the result is at least `minK`.
 *
 * @param {number[]} scores - Scores, expected in descending order.
 * @param {number} [coverage=0.90] - Share of the total score to cover (0..1).
 * @param {number} [minK=15] - Preferred lower bound for K.
 * @param {number} [maxK=80] - Hard upper bound for K.
 * @returns {number} Number of leading items to keep; 0 for empty input.
 */
function dynamicTopK(scores, coverage = 0.90, minK = 15, maxK = 80) {
    const count = scores?.length ?? 0;
    if (!count) return 0;

    // Effective bounds: K cannot exceed what exists, and the floor cannot
    // exceed the ceiling.
    const upper = Math.max(0, Math.min(maxK, count));
    const lower = Math.max(0, Math.min(minK, upper));

    const total = scores.reduce((sum, score) => sum + score, 0);
    if (total <= 0) return lower;

    let cumulative = 0;
    for (let i = 0; i < count; i++) {
        cumulative += scores[i];
        if (cumulative / total >= coverage) {
            return Math.max(lower, Math.min(upper, i + 1));
        }
    }

    // Coverage never reached (e.g. floating-point rounding at coverage = 1).
    return upper;
}
|
||||||
|
|
||||||
/**
 * Text search over the event index, returning ranked candidates.
 *
 * Runs an exact-term search (no fuzzy, no prefix) with field boosts, keeps the
 * leading K results chosen by `dynamicTopK`, and attaches diagnostics to the
 * returned array as a non-index `_gapInfo` property.
 *
 * @param {string} queryText - Query string; blank input yields an empty result.
 * @param {number} [limit=80] - Upper bound handed to `dynamicTopK` as its max K.
 * @returns {Array<{id: *, textRank: number, score: number}>} Ranked hits
 *   (`textRank` is 1-based). When non-empty, the array also carries `_gapInfo`
 *   with `total`, `returned`, `coverage` and `scoreRange`. Returns `[]` when
 *   there is no index, no query, no hit, or the search throws.
 */
export function searchEventsByText(queryText, limit = 80) {
    if (!idx || !queryText?.trim()) return [];

    try {
        const results = idx.search(queryText, {
            boost: { title: 4, participants: 2, summary: 1 },
            fuzzy: false,
            prefix: false,
        });

        if (!results.length) return [];

        const scores = results.map((r) => r.score);
        const k = dynamicTopK(scores, 0.90, 15, limit);

        const output = results.slice(0, k).map((r, i) => ({
            id: r.id,
            textRank: i + 1,
            score: r.score,
        }));

        // Report what was actually returned: k may be larger than the number
        // of hits, which would misstate `returned` and leave `cutoff` undefined.
        const returned = output.length;

        const sum = (values) => values.reduce((a, b) => a + b, 0);
        const total = sum(scores);
        const kept = sum(scores.slice(0, returned));
        // Guard the division so a zero total does not render as "NaN%".
        const share = total > 0 ? kept / total : 0;

        output._gapInfo = {
            total: results.length,
            returned,
            coverage: (share * 100).toFixed(1) + '%',
            scoreRange: {
                top: scores[0]?.toFixed(1),
                cutoff: scores[returned - 1]?.toFixed(1),
                p50: scores[Math.floor(scores.length / 2)]?.toFixed(1),
                last: scores[scores.length - 1]?.toFixed(1),
            },
        };

        return output;
    } catch (e) {
        console.error('[text-search] Search failed:', e);
        return [];
    }
}
|
||||||
|
|
||||||
/**
|
|
||||||
* 清理索引
|
|
||||||
*/
|
|
||||||
export function clearEventTextIndex() {
|
export function clearEventTextIndex() {
|
||||||
idx = null;
|
idx = null;
|
||||||
lastRevision = null;
|
lastRevision = null;
|
||||||
|
|||||||
Reference in New Issue
Block a user