feat(ai): hybrid RRF retrieval, heading-breadcrumb chunks, merged search tool
Improve agent RAG quality with three changes, plus a roadmap doc for the rest.
- Indexer: prefix each chunk with its heading path ("Page > H1 > H2"), built by
walking the ProseMirror JSON (heading nodes) so a `#` inside a fenced code block
is never mistaken for a heading. Falls back to plain-text chunking on any error.
buildChunkRows: replace indexOf-against-source offsets (breadcrumb prefixes break
verbatim matching) with a cumulative cursor — offsets are provenance-only.
- Hybrid search: new migration adds a generated `fts` tsvector column + GIN index
to page_embeddings (same english+f_unaccent config as pages.tsv). New
PageEmbeddingRepo.hybridSearch fuses cosine + full-text rankings via Reciprocal
Rank Fusion (k=60, equal weights) in one SQL query at chunk granularity.
- Tools: collapse semanticSearch + searchPages into one hybrid `searchPages` tool
with a query-rewrite-oriented description; gracefully falls back to the REST
full-text path when embeddings are unconfigured. Access control (space scope +
page-permission post-filter) preserved. Add a query-rewrite hint to the default
system prompt.
- docs/rag-improvements-plan.md: record what shipped and the deferred backlog
(reranker, attachment indexing, eval harness, tuning).
Note: requires a corpus reindex to populate breadcrumbs on existing pages.
This commit is contained in:
@@ -9,6 +9,8 @@ const DEFAULT_PROMPT = [
|
||||
'You help the current user find, read, and reason about pages in their workspace.',
|
||||
'Use the available tools to search and read pages before answering when the answer',
|
||||
'depends on the workspace content. Cite the pages you used. Be concise and accurate.',
|
||||
"When searching, rephrase the user's question into focused keyword queries, and search",
|
||||
'again with different terms if the first results are weak.',
|
||||
].join(' ');
|
||||
|
||||
/**
|
||||
|
||||
@@ -77,9 +77,28 @@ export class EmbeddingIndexerService {
|
||||
return;
|
||||
}
|
||||
|
||||
const text = this.extractText(page);
|
||||
if (!text || text.trim().length === 0) {
|
||||
// Empty page -> remove any prior embeddings so search returns nothing.
|
||||
// Prefer heading-breadcrumb chunks: each chunk is prefixed with its heading
|
||||
// path ("Page Title > H1 > H2") so the breadcrumb is embedded AND stored in
|
||||
// `content` (feeding the fts column and the agent's snippet). Walk the
|
||||
// ProseMirror JSON — NOT the markdown text — so a `#` inside a fenced code
|
||||
// block is never mistaken for a heading. Degrades to the plain-text path on
|
||||
// any error / unknown structure (returns null).
|
||||
const breadcrumbChunks = page.content
|
||||
? await this.safeBuildBreadcrumbChunks(page.content, page.title)
|
||||
: null;
|
||||
|
||||
// Fall back to plain text when breadcrumb chunking is unavailable.
|
||||
const fallbackText =
|
||||
breadcrumbChunks && breadcrumbChunks.length > 0
|
||||
? null
|
||||
: this.extractText(page);
|
||||
|
||||
// Empty page (neither path produced content) -> remove any prior embeddings
|
||||
// so search returns nothing.
|
||||
if (
|
||||
(!breadcrumbChunks || breadcrumbChunks.length === 0) &&
|
||||
(!fallbackText || fallbackText.trim().length === 0)
|
||||
) {
|
||||
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
|
||||
return;
|
||||
}
|
||||
@@ -105,12 +124,17 @@ export class EmbeddingIndexerService {
|
||||
throw err;
|
||||
}
|
||||
|
||||
// Chunk the plain text.
|
||||
const splitter = new RecursiveCharacterTextSplitter({
|
||||
chunkSize: CHUNK_SIZE,
|
||||
chunkOverlap: CHUNK_OVERLAP,
|
||||
});
|
||||
const chunks = await splitter.splitText(text);
|
||||
// Use breadcrumb chunks when available; otherwise chunk the plain text.
|
||||
let chunks: string[];
|
||||
if (breadcrumbChunks && breadcrumbChunks.length > 0) {
|
||||
chunks = breadcrumbChunks;
|
||||
} else {
|
||||
const splitter = new RecursiveCharacterTextSplitter({
|
||||
chunkSize: CHUNK_SIZE,
|
||||
chunkOverlap: CHUNK_OVERLAP,
|
||||
});
|
||||
chunks = await splitter.splitText(fallbackText as string);
|
||||
}
|
||||
if (chunks.length === 0) {
|
||||
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
|
||||
return;
|
||||
@@ -139,7 +163,6 @@ export class EmbeddingIndexerService {
|
||||
const rows = this.buildChunkRows(
|
||||
chunks,
|
||||
vectors,
|
||||
text,
|
||||
{ pageId, workspaceId, spaceId },
|
||||
modelName,
|
||||
);
|
||||
@@ -255,14 +278,16 @@ export class EmbeddingIndexerService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Map chunk strings + vectors to insertable rows, computing chunkStart /
|
||||
* chunkLength against the source text. A moving cursor handles repeated
|
||||
* substrings and overlap so offsets stay monotonic.
|
||||
* Map chunk strings + vectors to insertable rows. Breadcrumb-prefixed chunks
|
||||
* are NOT verbatim substrings of any source text, so chunkStart is a running
|
||||
* cumulative offset (sum of previous chunk lengths) rather than an indexOf
|
||||
* position. These offsets are informational provenance only — search returns
|
||||
* `content` and never slices by offset. chunkIndex stays a global monotonic
|
||||
* index.
|
||||
*/
|
||||
private buildChunkRows(
|
||||
chunks: string[],
|
||||
vectors: number[][],
|
||||
sourceText: string,
|
||||
ids: { pageId: string; workspaceId: string; spaceId: string },
|
||||
modelName: string,
|
||||
): PageEmbeddingChunkRow[] {
|
||||
@@ -272,11 +297,8 @@ export class EmbeddingIndexerService {
|
||||
const chunk = chunks[i];
|
||||
const embedding = vectors[i];
|
||||
if (!embedding) continue;
|
||||
const found = sourceText.indexOf(chunk, cursor);
|
||||
const chunkStart = found >= 0 ? found : cursor;
|
||||
// Advance the cursor past the start so later identical chunks resolve to
|
||||
// later occurrences (overlap keeps the next search valid).
|
||||
cursor = chunkStart + 1;
|
||||
const chunkStart = cursor;
|
||||
cursor += chunk.length;
|
||||
rows.push({
|
||||
pageId: ids.pageId,
|
||||
workspaceId: ids.workspaceId,
|
||||
@@ -295,4 +317,106 @@ export class EmbeddingIndexerService {
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
/**
 * Error-tolerant entry point for breadcrumb chunking.
 *
 * Delegates to buildBreadcrumbChunks and converts any thrown error or rejected
 * promise into a `null` result, which callers treat as "breadcrumb chunking
 * unavailable" and answer by chunking the plain text instead.
 *
 * @param contentJson Page content to walk; passed through untouched.
 * @param pageTitle Page title forwarded as the leading breadcrumb segment.
 * @returns Whatever buildBreadcrumbChunks resolves to, or `null` on failure.
 */
private async safeBuildBreadcrumbChunks(
  contentJson: unknown,
  pageTitle: string | null,
): Promise<string[] | null> {
  let chunks: string[] | null = null;
  try {
    chunks = await this.buildBreadcrumbChunks(contentJson, pageTitle);
  } catch {
    // Deliberate best-effort: the failure is not surfaced, the caller simply
    // takes the plain-text path.
    chunks = null;
  }
  return chunks;
}
|
||||
|
||||
/**
 * Produce heading-breadcrumb chunks from a ProseMirror JSON document.
 *
 * Top-level blocks are visited in order. Text of non-heading blocks is
 * accumulated into a section; whenever a heading block is reached (and once
 * more at the end) the accumulated section is split with a
 * RecursiveCharacterTextSplitter configured from CHUNK_SIZE / CHUNK_OVERLAP,
 * and each piece is emitted prefixed with the breadcrumb in effect for that
 * section ("Page Title > H1 > H2") followed by a blank line. Because headings
 * are recognised by node type rather than by scanning text, a `#` inside a
 * code block is never treated as a heading.
 *
 * @param contentJson Candidate ProseMirror document.
 * @param pageTitle Title used as the first breadcrumb segment; skipped when
 *   null or blank.
 * @returns The prefixed chunks (possibly an empty array), or `null` when
 *   `contentJson` is not a non-null object with an array `content`, so the
 *   caller can fall back to plain-text chunking.
 */
private async buildBreadcrumbChunks(
  contentJson: unknown,
  pageTitle: string | null,
): Promise<string[] | null> {
  // Only a non-null object carrying an array `content` can be walked.
  if (typeof contentJson !== 'object' || contentJson === null) {
    return null;
  }
  const topLevel = (contentJson as { content?: unknown }).content;
  if (!Array.isArray(topLevel)) {
    return null;
  }

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: CHUNK_SIZE,
    chunkOverlap: CHUNK_OVERLAP,
  });

  // Render a single block by wrapping it in a throwaway one-node document.
  const textOf = (node: unknown): string =>
    jsonToText({ type: 'doc', content: [node] } as never);

  const chunks: string[] = [];
  // Headings currently in scope, outermost first.
  const trail: { level: number; text: string }[] = [];
  // Body text gathered since the last heading.
  let pending = '';

  // Emit the gathered body under the breadcrumb that is active right now,
  // then start a fresh section.
  const emitSection = async (): Promise<void> => {
    const section = pending;
    pending = '';
    if (section.trim().length === 0) {
      return;
    }
    const segments: Array<string | null> = [
      pageTitle,
      ...trail.map((entry) => entry.text),
    ];
    const crumb = segments
      .filter(
        (segment) => typeof segment === 'string' && segment.trim().length > 0,
      )
      .join(' > ');
    const prefix = crumb.length > 0 ? `${crumb}\n\n` : '';
    const pieces = await splitter.splitText(section);
    for (const piece of pieces) {
      chunks.push(`${prefix}${piece}`);
    }
  };

  for (const node of topLevel as Array<{
    type?: string;
    attrs?: { level?: number };
  }>) {
    if (node?.type !== 'heading') {
      // Ordinary block: append its text to the current section.
      const body = textOf(node);
      pending = pending.length > 0 ? `${pending}\n${body}` : body;
      continue;
    }

    // A heading closes the section before it; that section still belongs to
    // the previous breadcrumb, so emit before touching the trail.
    await emitSection();

    const rawLevel = node.attrs?.level;
    const depth = typeof rawLevel === 'number' ? rawLevel : 1;

    // Discard headings at the same or a deeper level: a new H2 supersedes an
    // earlier H2 and everything nested beneath it.
    while (trail.length > 0 && trail[trail.length - 1].level >= depth) {
      trail.pop();
    }

    // Headings with no text still reset the trail but add no segment.
    const label = textOf(node).trim();
    if (label.length > 0) {
      trail.push({ level: depth, text: label });
    }
  }

  // Body following the final heading (or the whole page if there were none).
  await emitSection();

  return chunks;
}
|
||||
}
|
||||
|
||||
@@ -87,37 +87,126 @@ export class AiChatToolsService {
|
||||
return {
|
||||
searchPages: tool({
|
||||
description:
|
||||
'Full-text search across the pages the current user can access. ' +
|
||||
'Returns a compact list of matching pages with a short snippet.',
|
||||
'Search the wiki for pages relevant to a query. Combines exact ' +
|
||||
'keyword/identifier matching with semantic meaning and returns the ' +
|
||||
'most relevant pages with a short snippet, best match first. ' +
|
||||
"Rephrase the user's question into a focused search query (key terms " +
|
||||
'and entities), not a full sentence. If the first results look weak ' +
|
||||
'or incomplete, search again with different wording or synonyms ' +
|
||||
'before answering.',
|
||||
inputSchema: z.object({
|
||||
query: z.string().describe('The search query.'),
|
||||
limit: z
|
||||
.number()
|
||||
.int()
|
||||
.min(1)
|
||||
.max(50)
|
||||
.max(20)
|
||||
.optional()
|
||||
.describe('Maximum number of results (1-50).'),
|
||||
.describe('Maximum number of results (1-20).'),
|
||||
}),
|
||||
execute: async ({ query, limit }) => {
|
||||
// search(query, spaceId?, limit?) -> { items, success }.
|
||||
// Items are filterSearchResult(): { id, title, highlight, ... }.
|
||||
const result = await client.search(query, undefined, limit);
|
||||
const items = Array.isArray(result?.items) ? result.items : [];
|
||||
// Keep the payload token-efficient: id + title + a short snippet only.
|
||||
return items.map((raw) => {
|
||||
const item = raw as {
|
||||
id?: string;
|
||||
slugId?: string;
|
||||
title?: string;
|
||||
highlight?: string;
|
||||
};
|
||||
return {
|
||||
id: item.id ?? item.slugId,
|
||||
title: item.title ?? '',
|
||||
snippet: snippet(item.highlight),
|
||||
};
|
||||
});
|
||||
const trimmed = (query ?? '').trim();
|
||||
if (!trimmed) return [];
|
||||
|
||||
const cap = limit ?? 10;
|
||||
|
||||
// Loopback REST full-text fallback. Used when AI search is not
|
||||
// configured, embedding fails, there are no accessible spaces, or the
|
||||
// hybrid query returns nothing — so keyword search always works.
|
||||
const fallback = async () => {
|
||||
// search(query, spaceId?, limit?) -> { items, success }.
|
||||
// Items are filterSearchResult(): { id, title, highlight, ... }.
|
||||
const result = await client.search(trimmed, undefined, cap);
|
||||
const items = Array.isArray(result?.items) ? result.items : [];
|
||||
// Keep the payload token-efficient: id + title + a short snippet.
|
||||
return items.map((raw) => {
|
||||
const item = raw as {
|
||||
id?: string;
|
||||
slugId?: string;
|
||||
title?: string;
|
||||
highlight?: string;
|
||||
};
|
||||
return {
|
||||
id: item.id ?? item.slugId,
|
||||
title: item.title ?? '',
|
||||
snippet: snippet(item.highlight),
|
||||
};
|
||||
});
|
||||
};
|
||||
|
||||
// HYBRID path: fuse semantic (vector) + lexical (full-text) rankings
|
||||
// via RRF. Over-fetch candidates so the page-permission post-filter
|
||||
// still leaves enough results.
|
||||
const candidates = Math.min(Math.max(cap * 5, 50), 200);
|
||||
|
||||
// 1) Embed the query. Unconfigured embeddings (or any embedding error)
|
||||
// routes to the REST full-text fallback instead of erroring.
|
||||
let queryVector: number[];
|
||||
try {
|
||||
const [vec] = await this.aiService.embedTexts(workspaceId, [
|
||||
trimmed,
|
||||
]);
|
||||
if (!vec) return await fallback();
|
||||
queryVector = vec;
|
||||
} catch (err) {
|
||||
if (!(err instanceof AiEmbeddingNotConfiguredException)) {
|
||||
// Never leak provider/key details; log generically and fall back.
|
||||
this.logger.warn(
|
||||
`searchPages embed failed: ${
|
||||
err instanceof Error ? err.message : 'unknown error'
|
||||
}`,
|
||||
);
|
||||
}
|
||||
return await fallback();
|
||||
}
|
||||
|
||||
// 2) ACCESS CONTROL: the hybrid query runs IN-PROCESS (a direct
|
||||
// pgvector + full-text query), so unlike the loopback REST tools it
|
||||
// does NOT get CASL for free. Scope to the spaces the user can read
|
||||
// (member spaces + groups), mirroring SearchService.searchPage. No
|
||||
// accessible spaces => fall back to REST (which is CASL-scoped).
|
||||
const accessibleSpaceIds =
|
||||
await this.spaceMemberRepo.getUserSpaceIds(user.id);
|
||||
if (accessibleSpaceIds.length === 0) return await fallback();
|
||||
|
||||
// 3) Hybrid RRF retrieval, scoped to the workspace AND accessible
|
||||
// spaces.
|
||||
const hits = await this.pageEmbeddingRepo.hybridSearch(
|
||||
workspaceId,
|
||||
queryVector,
|
||||
trimmed,
|
||||
accessibleSpaceIds,
|
||||
candidates,
|
||||
);
|
||||
if (hits.length === 0) return await fallback();
|
||||
|
||||
// 4) Page-level permission post-filter: an accessible space does not
|
||||
// imply every page in it is accessible (restricted pages). Mirror
|
||||
// SearchService.searchPage's filterAccessiblePageIds pass.
|
||||
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
|
||||
const accessibleIds =
|
||||
await this.pagePermissionRepo.filterAccessiblePageIds({
|
||||
pageIds,
|
||||
userId: user.id,
|
||||
});
|
||||
const accessibleSet = new Set(accessibleIds);
|
||||
|
||||
// Keep the best (first — hits are ordered by fused score desc) chunk
|
||||
// per page, capped to `cap`.
|
||||
const seen = new Set<string>();
|
||||
const results: { id: string; title: string; snippet: string }[] = [];
|
||||
for (const hit of hits) {
|
||||
if (!accessibleSet.has(hit.pageId)) continue;
|
||||
if (seen.has(hit.pageId)) continue;
|
||||
seen.add(hit.pageId);
|
||||
results.push({
|
||||
id: hit.pageId,
|
||||
title: hit.title ?? '',
|
||||
snippet: snippet(hit.content),
|
||||
});
|
||||
if (results.length >= cap) break;
|
||||
}
|
||||
return results;
|
||||
},
|
||||
}),
|
||||
|
||||
@@ -142,110 +231,6 @@ export class AiChatToolsService {
|
||||
},
|
||||
}),
|
||||
|
||||
semanticSearch: tool({
|
||||
description:
|
||||
'Semantic (vector) search across the pages the current user can ' +
|
||||
'access. Finds pages by meaning, not just keywords — use it to ' +
|
||||
'answer conceptual questions. Returns a compact list of relevant ' +
|
||||
'pages with a short snippet. Falls back to searchPages if semantic ' +
|
||||
'search is unavailable.',
|
||||
inputSchema: z.object({
|
||||
query: z.string().describe('The natural-language search query.'),
|
||||
limit: z
|
||||
.number()
|
||||
.int()
|
||||
.min(1)
|
||||
.max(20)
|
||||
.optional()
|
||||
.describe('Maximum number of results (1-20).'),
|
||||
}),
|
||||
execute: async ({ query, limit }) => {
|
||||
// ACCESS CONTROL: this tool runs IN-PROCESS (a direct pgvector query),
|
||||
// so unlike the loopback REST tools it does NOT get CASL for free. We
|
||||
// scope every query to the spaces the user can read, mirroring
|
||||
// SearchService.searchPage (§6.7 / §8). We additionally post-filter by
|
||||
// page-level permissions so restricted pages inside an accessible
|
||||
// space are never returned.
|
||||
const trimmed = (query ?? '').trim();
|
||||
if (trimmed.length === 0) return [];
|
||||
|
||||
// 1) Embed the query (no-op fallback when embeddings are unconfigured
|
||||
// so the agent can fall back to searchPages instead of erroring).
|
||||
let queryVector: number[];
|
||||
try {
|
||||
const [vec] = await this.aiService.embedTexts(workspaceId, [
|
||||
trimmed,
|
||||
]);
|
||||
if (!vec) return [];
|
||||
queryVector = vec;
|
||||
} catch (err) {
|
||||
if (err instanceof AiEmbeddingNotConfiguredException) {
|
||||
return {
|
||||
unavailable: true,
|
||||
reason:
|
||||
'semantic search unavailable (embeddings not configured)',
|
||||
};
|
||||
}
|
||||
// Never leak provider/key details; surface a generic unavailable.
|
||||
this.logger.warn(
|
||||
`semanticSearch embed failed: ${
|
||||
err instanceof Error ? err.message : 'unknown error'
|
||||
}`,
|
||||
);
|
||||
return {
|
||||
unavailable: true,
|
||||
reason: 'semantic search unavailable',
|
||||
};
|
||||
}
|
||||
|
||||
// 2) Resolve the spaces this user can read (member spaces + groups),
|
||||
// mirroring SearchService's space scoping. No spaces => no results.
|
||||
const accessibleSpaceIds =
|
||||
await this.spaceMemberRepo.getUserSpaceIds(user.id);
|
||||
if (accessibleSpaceIds.length === 0) return [];
|
||||
|
||||
// 3) Cosine ANN over the embeddings, scoped to the workspace AND the
|
||||
// accessible spaces. Over-fetch a little so the page-permission
|
||||
// post-filter still leaves enough results.
|
||||
const cap = limit ?? 10;
|
||||
const hits = await this.pageEmbeddingRepo.searchByEmbedding(
|
||||
workspaceId,
|
||||
queryVector,
|
||||
accessibleSpaceIds,
|
||||
cap * 3,
|
||||
);
|
||||
if (hits.length === 0) return [];
|
||||
|
||||
// 4) Page-level permission post-filter: a space being accessible does
|
||||
// not imply every page in it is (restricted pages). Mirror
|
||||
// SearchService.searchPage's filterAccessiblePageIds pass.
|
||||
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
|
||||
const accessibleIds =
|
||||
await this.pagePermissionRepo.filterAccessiblePageIds({
|
||||
pageIds,
|
||||
userId: user.id,
|
||||
});
|
||||
const accessibleSet = new Set(accessibleIds);
|
||||
|
||||
// Keep the best (lowest-distance) hit per page, capped to `limit`.
|
||||
const seen = new Set<string>();
|
||||
const results: { pageId: string; title: string; snippet: string }[] =
|
||||
[];
|
||||
for (const hit of hits) {
|
||||
if (!accessibleSet.has(hit.pageId)) continue;
|
||||
if (seen.has(hit.pageId)) continue;
|
||||
seen.add(hit.pageId);
|
||||
results.push({
|
||||
pageId: hit.pageId,
|
||||
title: hit.title ?? '',
|
||||
snippet: snippet(hit.content),
|
||||
});
|
||||
if (results.length >= cap) break;
|
||||
}
|
||||
return results;
|
||||
},
|
||||
}),
|
||||
|
||||
// --- WRITE tools (all reversible — history/trash; §6.5 / D3) ---
|
||||
|
||||
createPage: tool({
|
||||
|
||||
Reference in New Issue
Block a user