feat(ai): hybrid RRF retrieval, heading-breadcrumb chunks, merged search tool

Improve agent RAG quality with three changes, plus a roadmap doc for the rest.

- Indexer: prefix each chunk with its heading path ("Page > H1 > H2"), built by
  walking the ProseMirror JSON (heading nodes) so a `#` inside a fenced code block
  is never mistaken for a heading. Falls back to plain-text chunking on any error.
  buildChunkRows: replace the indexOf-against-source offsets (breadcrumb prefixes
  break verbatim matching) with a cumulative cursor — offsets are provenance-only.
- Hybrid search: new migration adds a generated `fts` tsvector column + GIN index
  to page_embeddings (same english+f_unaccent config as pages.tsv). New
  PageEmbeddingRepo.hybridSearch fuses cosine + full-text rankings via Reciprocal
  Rank Fusion (k=60, equal weights) in one SQL query at chunk granularity.
- Tools: collapse semanticSearch + searchPages into one hybrid `searchPages` tool
  with a query-rewrite-oriented description; gracefully falls back to the REST
  full-text path when embeddings are unconfigured or the embed call fails, when
  the user has no accessible spaces, or when the hybrid query returns no hits.
  Access control (space scope + page-permission post-filter) preserved. The
  tool's `limit` maximum drops from 50 to 20. Add a query-rewrite hint to the
  default system prompt.
- docs/rag-improvements-plan.md: record what shipped and the deferred backlog
  (reranker, attachment indexing, eval harness, tuning).

Note: requires a corpus reindex to populate breadcrumbs on existing pages.
This commit is contained in:
vvzvlad
2026-06-18 03:43:01 +03:00
parent 91a63f0b2c
commit c8e41e8916
6 changed files with 555 additions and 145 deletions

View File

@@ -9,6 +9,8 @@ const DEFAULT_PROMPT = [
'You help the current user find, read, and reason about pages in their workspace.',
'Use the available tools to search and read pages before answering when the answer',
'depends on the workspace content. Cite the pages you used. Be concise and accurate.',
"When searching, rephrase the user's question into focused keyword queries, and search",
'again with different terms if the first results are weak.',
].join(' ');
/**

View File

@@ -77,9 +77,28 @@ export class EmbeddingIndexerService {
return;
}
const text = this.extractText(page);
if (!text || text.trim().length === 0) {
// Empty page -> remove any prior embeddings so search returns nothing.
// Prefer heading-breadcrumb chunks: each chunk is prefixed with its heading
// path ("Page Title > H1 > H2") so the breadcrumb is embedded AND stored in
// `content` (feeding the fts column and the agent's snippet). Walk the
// ProseMirror JSON — NOT the markdown text — so a `#` inside a fenced code
// block is never mistaken for a heading. Degrades to the plain-text path on
// any error / unknown structure (returns null).
const breadcrumbChunks = page.content
? await this.safeBuildBreadcrumbChunks(page.content, page.title)
: null;
// Fall back to plain text when breadcrumb chunking is unavailable.
const fallbackText =
breadcrumbChunks && breadcrumbChunks.length > 0
? null
: this.extractText(page);
// Empty page (neither path produced content) -> remove any prior embeddings
// so search returns nothing.
if (
(!breadcrumbChunks || breadcrumbChunks.length === 0) &&
(!fallbackText || fallbackText.trim().length === 0)
) {
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
return;
}
@@ -105,12 +124,17 @@ export class EmbeddingIndexerService {
throw err;
}
// Chunk the plain text.
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: CHUNK_SIZE,
chunkOverlap: CHUNK_OVERLAP,
});
const chunks = await splitter.splitText(text);
// Use breadcrumb chunks when available; otherwise chunk the plain text.
let chunks: string[];
if (breadcrumbChunks && breadcrumbChunks.length > 0) {
chunks = breadcrumbChunks;
} else {
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: CHUNK_SIZE,
chunkOverlap: CHUNK_OVERLAP,
});
chunks = await splitter.splitText(fallbackText as string);
}
if (chunks.length === 0) {
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
return;
@@ -139,7 +163,6 @@ export class EmbeddingIndexerService {
const rows = this.buildChunkRows(
chunks,
vectors,
text,
{ pageId, workspaceId, spaceId },
modelName,
);
@@ -255,14 +278,16 @@ export class EmbeddingIndexerService {
}
/**
* Map chunk strings + vectors to insertable rows, computing chunkStart /
* chunkLength against the source text. A moving cursor handles repeated
* substrings and overlap so offsets stay monotonic.
* Map chunk strings + vectors to insertable rows. Breadcrumb-prefixed chunks
* are NOT verbatim substrings of any source text, so chunkStart is a running
* cumulative offset (sum of previous chunk lengths) rather than an indexOf
* position. These offsets are informational provenance only — search returns
* `content` and never slices by offset. chunkIndex stays a global monotonic
* index.
*/
private buildChunkRows(
chunks: string[],
vectors: number[][],
sourceText: string,
ids: { pageId: string; workspaceId: string; spaceId: string },
modelName: string,
): PageEmbeddingChunkRow[] {
@@ -272,11 +297,8 @@ export class EmbeddingIndexerService {
const chunk = chunks[i];
const embedding = vectors[i];
if (!embedding) continue;
const found = sourceText.indexOf(chunk, cursor);
const chunkStart = found >= 0 ? found : cursor;
// Advance the cursor past the start so later identical chunks resolve to
// later occurrences (overlap keeps the next search valid).
cursor = chunkStart + 1;
const chunkStart = cursor;
cursor += chunk.length;
rows.push({
pageId: ids.pageId,
workspaceId: ids.workspaceId,
@@ -295,4 +317,106 @@ export class EmbeddingIndexerService {
}
return rows;
}
/**
 * Best-effort front door for buildBreadcrumbChunks.
 *
 * Delegates to buildBreadcrumbChunks and converts ANY rejection into `null`,
 * which the caller treats as "breadcrumb chunking unavailable" and answers by
 * chunking the plain text instead. The error itself is intentionally dropped:
 * a page that cannot be walked is not a failure, only a downgrade.
 *
 * @param contentJson page content as stored (expected to be ProseMirror JSON)
 * @param pageTitle   page title used as the first breadcrumb segment, if any
 * @returns the breadcrumb-prefixed chunks, or `null` when they could not be
 *          built (unsupported structure or an exception while walking)
 */
private async safeBuildBreadcrumbChunks(
  contentJson: unknown,
  pageTitle: string | null,
): Promise<string[] | null> {
  let chunks: string[] | null = null;
  try {
    chunks = await this.buildBreadcrumbChunks(contentJson, pageTitle);
  } catch {
    // Deliberate swallow: leave `chunks` as null so the caller degrades to the
    // plain-text path.
  }
  return chunks;
}
/**
 * Turn a ProseMirror document into chunks that each carry their heading path.
 *
 * Only the document's TOP-LEVEL blocks are inspected. A block whose `type` is
 * 'heading' closes the section collected so far and updates the heading path;
 * every other block is rendered with jsonToText and appended to the current
 * section body. When a section is closed, its body is split with a
 * RecursiveCharacterTextSplitter (CHUNK_SIZE / CHUNK_OVERLAP) and every piece
 * is emitted as "<Page Title > H1 > H2>\n\n<piece>". Because headings are
 * recognised from explicit nodes rather than from text, a `#` inside a code
 * block can never open a section.
 *
 * Notes grounded in the logic below:
 * - The crumb is prepended AFTER splitting, so an emitted chunk may be longer
 *   than CHUNK_SIZE by the length of its crumb.
 * - A section whose body is blank emits nothing, so a heading that has no body
 *   text before the next heading (or the end of the document) only appears in
 *   the output if a deeper section under it has a body.
 * - A heading whose rendered text is blank still pops deeper-or-equal entries
 *   from the path but is not itself added to it.
 *
 * @param contentJson page content; anything that is not a non-null object with
 *                    an array `content` is rejected
 * @param pageTitle   first breadcrumb segment; skipped when null or blank
 * @returns the chunks in document order (possibly empty), or `null` when
 *          `contentJson` does not have the expected shape
 */
private async buildBreadcrumbChunks(
  contentJson: unknown,
  pageTitle: string | null,
): Promise<string[] | null> {
  type HeadingEntry = { level: number; text: string };
  type TopLevelBlock = { type?: string; attrs?: { level?: number } };

  // Shape guard: bail out (-> plain-text fallback) unless this looks like a doc.
  if (contentJson === null || typeof contentJson !== 'object') {
    return null;
  }
  const topLevel = (contentJson as { content?: unknown }).content;
  if (!Array.isArray(topLevel)) {
    return null;
  }

  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: CHUNK_SIZE,
    chunkOverlap: CHUNK_OVERLAP,
  });

  // Render a single block by wrapping it in a throwaway one-block document.
  const renderBlock = (node: TopLevelBlock): string =>
    jsonToText({ type: 'doc', content: [node] } as never);

  const chunks: string[] = [];
  const headingPath: HeadingEntry[] = [];
  let pendingBody = '';

  // Emit the body collected so far under the heading path that is in effect
  // right now, then start a fresh body.
  const emitPendingBody = async (): Promise<void> => {
    const body = pendingBody;
    pendingBody = '';
    if (body.trim().length === 0) {
      return;
    }

    const labels: string[] = [];
    for (const candidate of [pageTitle, ...headingPath.map((e) => e.text)]) {
      if (typeof candidate === 'string' && candidate.trim().length > 0) {
        labels.push(candidate);
      }
    }
    const crumb = labels.join(' > ');
    const prefix = crumb ? `${crumb}\n\n` : '';

    const pieces = await textSplitter.splitText(body);
    for (const piece of pieces) {
      chunks.push(`${prefix}${piece}`);
    }
  };

  for (const node of topLevel as TopLevelBlock[]) {
    if (node?.type !== 'heading') {
      // Ordinary block: accumulate its text, newline-separated.
      const rendered = renderBlock(node);
      pendingBody =
        pendingBody.length > 0 ? `${pendingBody}\n${rendered}` : rendered;
      continue;
    }

    // The body before this heading belongs to the PREVIOUS path, so emit it
    // before the path is touched.
    await emitPendingBody();

    const declaredLevel = node.attrs?.level;
    const level = typeof declaredLevel === 'number' ? declaredLevel : 1;

    // A new heading replaces every entry at the same or a deeper level.
    while (
      headingPath.length > 0 &&
      headingPath[headingPath.length - 1].level >= level
    ) {
      headingPath.pop();
    }

    const title = renderBlock(node).trim();
    if (title.length > 0) {
      headingPath.push({ level, text: title });
    }
  }

  // Whatever follows the last heading still needs to be emitted.
  await emitPendingBody();
  return chunks;
}
}

View File

@@ -87,37 +87,126 @@ export class AiChatToolsService {
return {
searchPages: tool({
description:
'Full-text search across the pages the current user can access. ' +
'Returns a compact list of matching pages with a short snippet.',
'Search the wiki for pages relevant to a query. Combines exact ' +
'keyword/identifier matching with semantic meaning and returns the ' +
'most relevant pages with a short snippet, best match first. ' +
"Rephrase the user's question into a focused search query (key terms " +
'and entities), not a full sentence. If the first results look weak ' +
'or incomplete, search again with different wording or synonyms ' +
'before answering.',
inputSchema: z.object({
query: z.string().describe('The search query.'),
limit: z
.number()
.int()
.min(1)
.max(50)
.max(20)
.optional()
.describe('Maximum number of results (1-50).'),
.describe('Maximum number of results (1-20).'),
}),
execute: async ({ query, limit }) => {
// search(query, spaceId?, limit?) -> { items, success }.
// Items are filterSearchResult(): { id, title, highlight, ... }.
const result = await client.search(query, undefined, limit);
const items = Array.isArray(result?.items) ? result.items : [];
// Keep the payload token-efficient: id + title + a short snippet only.
return items.map((raw) => {
const item = raw as {
id?: string;
slugId?: string;
title?: string;
highlight?: string;
};
return {
id: item.id ?? item.slugId,
title: item.title ?? '',
snippet: snippet(item.highlight),
};
});
const trimmed = (query ?? '').trim();
if (!trimmed) return [];
const cap = limit ?? 10;
// Loopback REST full-text fallback. Used when AI search is not
// configured, embedding fails, there are no accessible spaces, or the
// hybrid query returns nothing — so keyword search always works.
const fallback = async () => {
// search(query, spaceId?, limit?) -> { items, success }.
// Items are filterSearchResult(): { id, title, highlight, ... }.
const result = await client.search(trimmed, undefined, cap);
const items = Array.isArray(result?.items) ? result.items : [];
// Keep the payload token-efficient: id + title + a short snippet.
return items.map((raw) => {
const item = raw as {
id?: string;
slugId?: string;
title?: string;
highlight?: string;
};
return {
id: item.id ?? item.slugId,
title: item.title ?? '',
snippet: snippet(item.highlight),
};
});
};
// HYBRID path: fuse semantic (vector) + lexical (full-text) rankings
// via RRF. Over-fetch candidates so the page-permission post-filter
// still leaves enough results.
const candidates = Math.min(Math.max(cap * 5, 50), 200);
// 1) Embed the query. Unconfigured embeddings (or any embedding error)
// routes to the REST full-text fallback instead of erroring.
let queryVector: number[];
try {
const [vec] = await this.aiService.embedTexts(workspaceId, [
trimmed,
]);
if (!vec) return await fallback();
queryVector = vec;
} catch (err) {
if (!(err instanceof AiEmbeddingNotConfiguredException)) {
// Never leak provider/key details; log generically and fall back.
this.logger.warn(
`searchPages embed failed: ${
err instanceof Error ? err.message : 'unknown error'
}`,
);
}
return await fallback();
}
// 2) ACCESS CONTROL: the hybrid query runs IN-PROCESS (a direct
// pgvector + full-text query), so unlike the loopback REST tools it
// does NOT get CASL for free. Scope to the spaces the user can read
// (member spaces + groups), mirroring SearchService.searchPage. No
// accessible spaces => fall back to REST (which is CASL-scoped).
const accessibleSpaceIds =
await this.spaceMemberRepo.getUserSpaceIds(user.id);
if (accessibleSpaceIds.length === 0) return await fallback();
// 3) Hybrid RRF retrieval, scoped to the workspace AND accessible
// spaces.
const hits = await this.pageEmbeddingRepo.hybridSearch(
workspaceId,
queryVector,
trimmed,
accessibleSpaceIds,
candidates,
);
if (hits.length === 0) return await fallback();
// 4) Page-level permission post-filter: an accessible space does not
// imply every page in it is accessible (restricted pages). Mirror
// SearchService.searchPage's filterAccessiblePageIds pass.
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
const accessibleIds =
await this.pagePermissionRepo.filterAccessiblePageIds({
pageIds,
userId: user.id,
});
const accessibleSet = new Set(accessibleIds);
// Keep the best (first — hits are ordered by fused score desc) chunk
// per page, capped to `cap`.
const seen = new Set<string>();
const results: { id: string; title: string; snippet: string }[] = [];
for (const hit of hits) {
if (!accessibleSet.has(hit.pageId)) continue;
if (seen.has(hit.pageId)) continue;
seen.add(hit.pageId);
results.push({
id: hit.pageId,
title: hit.title ?? '',
snippet: snippet(hit.content),
});
if (results.length >= cap) break;
}
return results;
},
}),
@@ -142,110 +231,6 @@ export class AiChatToolsService {
},
}),
semanticSearch: tool({
description:
'Semantic (vector) search across the pages the current user can ' +
'access. Finds pages by meaning, not just keywords — use it to ' +
'answer conceptual questions. Returns a compact list of relevant ' +
'pages with a short snippet. Falls back to searchPages if semantic ' +
'search is unavailable.',
inputSchema: z.object({
query: z.string().describe('The natural-language search query.'),
limit: z
.number()
.int()
.min(1)
.max(20)
.optional()
.describe('Maximum number of results (1-20).'),
}),
execute: async ({ query, limit }) => {
// ACCESS CONTROL: this tool runs IN-PROCESS (a direct pgvector query),
// so unlike the loopback REST tools it does NOT get CASL for free. We
// scope every query to the spaces the user can read, mirroring
// SearchService.searchPage (§6.7 / §8). We additionally post-filter by
// page-level permissions so restricted pages inside an accessible
// space are never returned.
const trimmed = (query ?? '').trim();
if (trimmed.length === 0) return [];
// 1) Embed the query (no-op fallback when embeddings are unconfigured
// so the agent can fall back to searchPages instead of erroring).
let queryVector: number[];
try {
const [vec] = await this.aiService.embedTexts(workspaceId, [
trimmed,
]);
if (!vec) return [];
queryVector = vec;
} catch (err) {
if (err instanceof AiEmbeddingNotConfiguredException) {
return {
unavailable: true,
reason:
'semantic search unavailable (embeddings not configured)',
};
}
// Never leak provider/key details; surface a generic unavailable.
this.logger.warn(
`semanticSearch embed failed: ${
err instanceof Error ? err.message : 'unknown error'
}`,
);
return {
unavailable: true,
reason: 'semantic search unavailable',
};
}
// 2) Resolve the spaces this user can read (member spaces + groups),
// mirroring SearchService's space scoping. No spaces => no results.
const accessibleSpaceIds =
await this.spaceMemberRepo.getUserSpaceIds(user.id);
if (accessibleSpaceIds.length === 0) return [];
// 3) Cosine ANN over the embeddings, scoped to the workspace AND the
// accessible spaces. Over-fetch a little so the page-permission
// post-filter still leaves enough results.
const cap = limit ?? 10;
const hits = await this.pageEmbeddingRepo.searchByEmbedding(
workspaceId,
queryVector,
accessibleSpaceIds,
cap * 3,
);
if (hits.length === 0) return [];
// 4) Page-level permission post-filter: a space being accessible does
// not imply every page in it is (restricted pages). Mirror
// SearchService.searchPage's filterAccessiblePageIds pass.
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
const accessibleIds =
await this.pagePermissionRepo.filterAccessiblePageIds({
pageIds,
userId: user.id,
});
const accessibleSet = new Set(accessibleIds);
// Keep the best (lowest-distance) hit per page, capped to `limit`.
const seen = new Set<string>();
const results: { pageId: string; title: string; snippet: string }[] =
[];
for (const hit of hits) {
if (!accessibleSet.has(hit.pageId)) continue;
if (seen.has(hit.pageId)) continue;
seen.add(hit.pageId);
results.push({
pageId: hit.pageId,
title: hit.title ?? '',
snippet: snippet(hit.content),
});
if (results.length >= cap) break;
}
return results;
},
}),
// --- WRITE tools (all reversible — history/trash; §6.5 / D3) ---
createPage: tool({