feat(ai): hybrid RRF retrieval, heading-breadcrumb chunks, merged search tool

Improve agent RAG quality with three changes, plus a roadmap doc for the rest.

- Indexer: prefix each chunk with its heading path ("Page > H1 > H2"), built by
  walking the ProseMirror JSON (heading nodes) so a `#` inside a fenced code block
  is never mistaken for a heading. Falls back to plain-text chunking on any error.
  buildChunkRows: drop indexOf-against-source offsets (breadcrumb prefixes break
  verbatim matching) for a cumulative cursor — offsets are provenance-only.
- Hybrid search: new migration adds a generated `fts` tsvector column + GIN index
  to page_embeddings (same english+f_unaccent config as pages.tsv). New
  PageEmbeddingRepo.hybridSearch fuses cosine + full-text rankings via Reciprocal
  Rank Fusion (k=60, equal weights) in one SQL query at chunk granularity.
- Tools: collapse semanticSearch + searchPages into one hybrid `searchPages` tool
  with a query-rewrite-oriented description; gracefully falls back to the REST
  full-text path when embeddings are unconfigured. Access control (space scope +
  page-permission post-filter) preserved. Add a query-rewrite hint to the default
  system prompt.
- docs/rag-improvements-plan.md: record what shipped and the deferred backlog
  (reranker, attachment indexing, eval harness, tuning).

Note: requires a corpus reindex to populate breadcrumbs on existing pages.
This commit is contained in:
vvzvlad
2026-06-18 03:43:01 +03:00
parent 91a63f0b2c
commit c8e41e8916
6 changed files with 555 additions and 145 deletions

View File

@@ -9,6 +9,8 @@ const DEFAULT_PROMPT = [
'You help the current user find, read, and reason about pages in their workspace.',
'Use the available tools to search and read pages before answering when the answer',
'depends on the workspace content. Cite the pages you used. Be concise and accurate.',
"When searching, rephrase the user's question into focused keyword queries, and search",
'again with different terms if the first results are weak.',
].join(' ');
/**

View File

@@ -77,9 +77,28 @@ export class EmbeddingIndexerService {
return;
}
const text = this.extractText(page);
if (!text || text.trim().length === 0) {
// Empty page -> remove any prior embeddings so search returns nothing.
// Prefer heading-breadcrumb chunks: each chunk is prefixed with its heading
// path ("Page Title > H1 > H2") so the breadcrumb is embedded AND stored in
// `content` (feeding the fts column and the agent's snippet). Walk the
// ProseMirror JSON — NOT the markdown text — so a `#` inside a fenced code
// block is never mistaken for a heading. Degrades to the plain-text path on
// any error / unknown structure (returns null).
const breadcrumbChunks = page.content
? await this.safeBuildBreadcrumbChunks(page.content, page.title)
: null;
// Fall back to plain text when breadcrumb chunking is unavailable.
const fallbackText =
breadcrumbChunks && breadcrumbChunks.length > 0
? null
: this.extractText(page);
// Empty page (neither path produced content) -> remove any prior embeddings
// so search returns nothing.
if (
(!breadcrumbChunks || breadcrumbChunks.length === 0) &&
(!fallbackText || fallbackText.trim().length === 0)
) {
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
return;
}
@@ -105,12 +124,17 @@ export class EmbeddingIndexerService {
throw err;
}
// Chunk the plain text.
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: CHUNK_SIZE,
chunkOverlap: CHUNK_OVERLAP,
});
const chunks = await splitter.splitText(text);
// Use breadcrumb chunks when available; otherwise chunk the plain text.
let chunks: string[];
if (breadcrumbChunks && breadcrumbChunks.length > 0) {
chunks = breadcrumbChunks;
} else {
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: CHUNK_SIZE,
chunkOverlap: CHUNK_OVERLAP,
});
chunks = await splitter.splitText(fallbackText as string);
}
if (chunks.length === 0) {
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
return;
@@ -139,7 +163,6 @@ export class EmbeddingIndexerService {
const rows = this.buildChunkRows(
chunks,
vectors,
text,
{ pageId, workspaceId, spaceId },
modelName,
);
@@ -255,14 +278,16 @@ export class EmbeddingIndexerService {
}
/**
* Map chunk strings + vectors to insertable rows, computing chunkStart /
* chunkLength against the source text. A moving cursor handles repeated
* substrings and overlap so offsets stay monotonic.
* Map chunk strings + vectors to insertable rows. Breadcrumb-prefixed chunks
* are NOT verbatim substrings of any source text, so chunkStart is a running
* cumulative offset (sum of previous chunk lengths) rather than an indexOf
* position. These offsets are informational provenance only — search returns
* `content` and never slices by offset. chunkIndex stays a global monotonic
* index.
*/
private buildChunkRows(
chunks: string[],
vectors: number[][],
sourceText: string,
ids: { pageId: string; workspaceId: string; spaceId: string },
modelName: string,
): PageEmbeddingChunkRow[] {
@@ -272,11 +297,8 @@ export class EmbeddingIndexerService {
const chunk = chunks[i];
const embedding = vectors[i];
if (!embedding) continue;
const found = sourceText.indexOf(chunk, cursor);
const chunkStart = found >= 0 ? found : cursor;
// Advance the cursor past the start so later identical chunks resolve to
// later occurrences (overlap keeps the next search valid).
cursor = chunkStart + 1;
const chunkStart = cursor;
cursor += chunk.length;
rows.push({
pageId: ids.pageId,
workspaceId: ids.workspaceId,
@@ -295,4 +317,106 @@ export class EmbeddingIndexerService {
}
return rows;
}
/**
* Thin try/catch wrapper around buildBreadcrumbChunks. Any failure (malformed
* structure, unknown node type, etc.) returns null so the caller degrades
* gracefully to the plain-text chunking path.
*/
private async safeBuildBreadcrumbChunks(
contentJson: unknown,
pageTitle: string | null,
): Promise<string[] | null> {
try {
return await this.buildBreadcrumbChunks(contentJson, pageTitle);
} catch {
return null;
}
}
/**
* Build heading-breadcrumb chunks by walking the ProseMirror JSON document.
*
* Each section (the body following a heading) is split with the same 1000/200
* RecursiveCharacterTextSplitter, and every resulting piece is prefixed with
* its heading path ("Page Title > H1 > H2"). Walking the JSON — not markdown
* text — means a `#` inside a fenced code block is never treated as a heading
* (ProseMirror heading nodes are explicit).
*
* Returns null when `contentJson` is not an object with an array `content`, so
* the caller falls back to plain-text chunking.
*/
private async buildBreadcrumbChunks(
contentJson: unknown,
pageTitle: string | null,
): Promise<string[] | null> {
const doc = contentJson as { content?: unknown };
if (
typeof contentJson !== 'object' ||
contentJson === null ||
!Array.isArray(doc.content)
) {
return null;
}
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: CHUNK_SIZE,
chunkOverlap: CHUNK_OVERLAP,
});
const out: string[] = [];
const stack: { level: number; text: string }[] = [];
let buffer = '';
// Flush the accumulated body as one or more chunks under the CURRENT crumb.
const flush = async (): Promise<void> => {
if (buffer.trim().length === 0) {
buffer = '';
return;
}
const crumb = [pageTitle, ...stack.map((s) => s.text)]
.filter((s) => typeof s === 'string' && s.trim().length > 0)
.join(' > ');
const pieces = await splitter.splitText(buffer);
for (const piece of pieces) {
out.push(crumb ? `${crumb}\n\n${piece}` : piece);
}
buffer = '';
};
for (const block of doc.content as Array<{
type?: string;
attrs?: { level?: number };
}>) {
if (block?.type === 'heading') {
// Flush the preceding body under the crumb in effect BEFORE this
// heading, then update the heading stack.
await flush();
const level =
typeof block.attrs?.level === 'number' ? block.attrs.level : 1;
// Pop deeper-or-equal headings: a new H2 replaces a prior H2/H3/...
while (stack.length > 0 && stack[stack.length - 1].level >= level) {
stack.pop();
}
const headingText = jsonToText({
type: 'doc',
content: [block],
} as never).trim();
if (headingText.length > 0) {
stack.push({ level, text: headingText });
}
} else {
const blockText = jsonToText({
type: 'doc',
content: [block],
} as never);
buffer = buffer.length > 0 ? `${buffer}\n${blockText}` : blockText;
}
}
// Flush any trailing body after the last heading.
await flush();
return out;
}
}

View File

@@ -87,37 +87,126 @@ export class AiChatToolsService {
return {
searchPages: tool({
description:
'Full-text search across the pages the current user can access. ' +
'Returns a compact list of matching pages with a short snippet.',
'Search the wiki for pages relevant to a query. Combines exact ' +
'keyword/identifier matching with semantic meaning and returns the ' +
'most relevant pages with a short snippet, best match first. ' +
"Rephrase the user's question into a focused search query (key terms " +
'and entities), not a full sentence. If the first results look weak ' +
'or incomplete, search again with different wording or synonyms ' +
'before answering.',
inputSchema: z.object({
query: z.string().describe('The search query.'),
limit: z
.number()
.int()
.min(1)
.max(50)
.max(20)
.optional()
.describe('Maximum number of results (1-50).'),
.describe('Maximum number of results (1-20).'),
}),
execute: async ({ query, limit }) => {
// search(query, spaceId?, limit?) -> { items, success }.
// Items are filterSearchResult(): { id, title, highlight, ... }.
const result = await client.search(query, undefined, limit);
const items = Array.isArray(result?.items) ? result.items : [];
// Keep the payload token-efficient: id + title + a short snippet only.
return items.map((raw) => {
const item = raw as {
id?: string;
slugId?: string;
title?: string;
highlight?: string;
};
return {
id: item.id ?? item.slugId,
title: item.title ?? '',
snippet: snippet(item.highlight),
};
});
const trimmed = (query ?? '').trim();
if (!trimmed) return [];
const cap = limit ?? 10;
// Loopback REST full-text fallback. Used when AI search is not
// configured, embedding fails, there are no accessible spaces, or the
// hybrid query returns nothing — so keyword search always works.
const fallback = async () => {
// search(query, spaceId?, limit?) -> { items, success }.
// Items are filterSearchResult(): { id, title, highlight, ... }.
const result = await client.search(trimmed, undefined, cap);
const items = Array.isArray(result?.items) ? result.items : [];
// Keep the payload token-efficient: id + title + a short snippet.
return items.map((raw) => {
const item = raw as {
id?: string;
slugId?: string;
title?: string;
highlight?: string;
};
return {
id: item.id ?? item.slugId,
title: item.title ?? '',
snippet: snippet(item.highlight),
};
});
};
// HYBRID path: fuse semantic (vector) + lexical (full-text) rankings
// via RRF. Over-fetch candidates so the page-permission post-filter
// still leaves enough results.
const candidates = Math.min(Math.max(cap * 5, 50), 200);
// 1) Embed the query. Unconfigured embeddings (or any embedding error)
// routes to the REST full-text fallback instead of erroring.
let queryVector: number[];
try {
const [vec] = await this.aiService.embedTexts(workspaceId, [
trimmed,
]);
if (!vec) return await fallback();
queryVector = vec;
} catch (err) {
if (!(err instanceof AiEmbeddingNotConfiguredException)) {
// Never leak provider/key details; log generically and fall back.
this.logger.warn(
`searchPages embed failed: ${
err instanceof Error ? err.message : 'unknown error'
}`,
);
}
return await fallback();
}
// 2) ACCESS CONTROL: the hybrid query runs IN-PROCESS (a direct
// pgvector + full-text query), so unlike the loopback REST tools it
// does NOT get CASL for free. Scope to the spaces the user can read
// (member spaces + groups), mirroring SearchService.searchPage. No
// accessible spaces => fall back to REST (which is CASL-scoped).
const accessibleSpaceIds =
await this.spaceMemberRepo.getUserSpaceIds(user.id);
if (accessibleSpaceIds.length === 0) return await fallback();
// 3) Hybrid RRF retrieval, scoped to the workspace AND accessible
// spaces.
const hits = await this.pageEmbeddingRepo.hybridSearch(
workspaceId,
queryVector,
trimmed,
accessibleSpaceIds,
candidates,
);
if (hits.length === 0) return await fallback();
// 4) Page-level permission post-filter: an accessible space does not
// imply every page in it is accessible (restricted pages). Mirror
// SearchService.searchPage's filterAccessiblePageIds pass.
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
const accessibleIds =
await this.pagePermissionRepo.filterAccessiblePageIds({
pageIds,
userId: user.id,
});
const accessibleSet = new Set(accessibleIds);
// Keep the best (first — hits are ordered by fused score desc) chunk
// per page, capped to `cap`.
const seen = new Set<string>();
const results: { id: string; title: string; snippet: string }[] = [];
for (const hit of hits) {
if (!accessibleSet.has(hit.pageId)) continue;
if (seen.has(hit.pageId)) continue;
seen.add(hit.pageId);
results.push({
id: hit.pageId,
title: hit.title ?? '',
snippet: snippet(hit.content),
});
if (results.length >= cap) break;
}
return results;
},
}),
@@ -142,110 +231,6 @@ export class AiChatToolsService {
},
}),
semanticSearch: tool({
description:
'Semantic (vector) search across the pages the current user can ' +
'access. Finds pages by meaning, not just keywords — use it to ' +
'answer conceptual questions. Returns a compact list of relevant ' +
'pages with a short snippet. Falls back to searchPages if semantic ' +
'search is unavailable.',
inputSchema: z.object({
query: z.string().describe('The natural-language search query.'),
limit: z
.number()
.int()
.min(1)
.max(20)
.optional()
.describe('Maximum number of results (1-20).'),
}),
execute: async ({ query, limit }) => {
// ACCESS CONTROL: this tool runs IN-PROCESS (a direct pgvector query),
// so unlike the loopback REST tools it does NOT get CASL for free. We
// scope every query to the spaces the user can read, mirroring
// SearchService.searchPage (§6.7 / §8). We additionally post-filter by
// page-level permissions so restricted pages inside an accessible
// space are never returned.
const trimmed = (query ?? '').trim();
if (trimmed.length === 0) return [];
// 1) Embed the query (no-op fallback when embeddings are unconfigured
// so the agent can fall back to searchPages instead of erroring).
let queryVector: number[];
try {
const [vec] = await this.aiService.embedTexts(workspaceId, [
trimmed,
]);
if (!vec) return [];
queryVector = vec;
} catch (err) {
if (err instanceof AiEmbeddingNotConfiguredException) {
return {
unavailable: true,
reason:
'semantic search unavailable (embeddings not configured)',
};
}
// Never leak provider/key details; surface a generic unavailable.
this.logger.warn(
`semanticSearch embed failed: ${
err instanceof Error ? err.message : 'unknown error'
}`,
);
return {
unavailable: true,
reason: 'semantic search unavailable',
};
}
// 2) Resolve the spaces this user can read (member spaces + groups),
// mirroring SearchService's space scoping. No spaces => no results.
const accessibleSpaceIds =
await this.spaceMemberRepo.getUserSpaceIds(user.id);
if (accessibleSpaceIds.length === 0) return [];
// 3) Cosine ANN over the embeddings, scoped to the workspace AND the
// accessible spaces. Over-fetch a little so the page-permission
// post-filter still leaves enough results.
const cap = limit ?? 10;
const hits = await this.pageEmbeddingRepo.searchByEmbedding(
workspaceId,
queryVector,
accessibleSpaceIds,
cap * 3,
);
if (hits.length === 0) return [];
// 4) Page-level permission post-filter: a space being accessible does
// not imply every page in it is (restricted pages). Mirror
// SearchService.searchPage's filterAccessiblePageIds pass.
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
const accessibleIds =
await this.pagePermissionRepo.filterAccessiblePageIds({
pageIds,
userId: user.id,
});
const accessibleSet = new Set(accessibleIds);
// Keep the best (lowest-distance) hit per page, capped to `limit`.
const seen = new Set<string>();
const results: { pageId: string; title: string; snippet: string }[] =
[];
for (const hit of hits) {
if (!accessibleSet.has(hit.pageId)) continue;
if (seen.has(hit.pageId)) continue;
seen.add(hit.pageId);
results.push({
pageId: hit.pageId,
title: hit.title ?? '',
snippet: snippet(hit.content),
});
if (results.length >= cap) break;
}
return results;
},
}),
// --- WRITE tools (all reversible — history/trash; §6.5 / D3) ---
createPage: tool({

View File

@@ -0,0 +1,48 @@
import { type Kysely, sql } from 'kysely';
/**
* Chunk-level lexical index for HYBRID retrieval (RRF) over `page_embeddings`.
*
* The agent's retrieval used to be either pure full-text (loopback REST over
* `pages.tsv`) OR pure vector (cosine over `page_embeddings.embedding`). Hybrid
* retrieval fuses BOTH rankings with Reciprocal Rank Fusion so exact keyword /
* identifier matches AND semantic matches both surface. The vector side already
* exists; this migration adds the missing LEXICAL side AT CHUNK GRANULARITY so
* both CTEs of the fused query rank the SAME chunk rows.
*
* `fts` is a GENERATED ALWAYS ... STORED `tsvector` built from `content` with
* the SAME text-search config as `pages.tsv`: `to_tsvector('english',
* f_unaccent(content))`. Using the identical config keeps lexical behaviour
* consistent with the existing page search (incl. unaccented Cyrillic content).
* `f_unaccent(text)` is declared IMMUTABLE (migration 20250729T213756), which is
* exactly what a GENERATED STORED column requires — so this needs NO trigger.
* The column is independent of the embedding vector dimension: it indexes text,
* not the vector, so it works for any model dimension.
*
* NOTE: `fts` is deliberately NOT added to the `PageEmbeddings` Kysely type. It
* is a generated column accessed ONLY via raw SQL (the hybrid query); adding it
* to the Kysely type would force it into the explicit-column insert in
* `insertChunks` and break inserts (a GENERATED column cannot be written to).
*/
export async function up(db: Kysely<any>): Promise<void> {
// Generated STORED tsvector mirroring pages.tsv's config. f_unaccent is
// IMMUTABLE so it is valid inside a GENERATED column expression (no trigger).
await sql`
ALTER TABLE page_embeddings
ADD COLUMN IF NOT EXISTS fts tsvector
GENERATED ALWAYS AS (to_tsvector('english', f_unaccent(content))) STORED
`.execute(db);
// GIN index for fast `fts @@ query` lexical matching on the chunk text.
await sql`
CREATE INDEX IF NOT EXISTS idx_page_embeddings_fts
ON page_embeddings USING gin(fts)
`.execute(db);
}
export async function down(db: Kysely<any>): Promise<void> {
await sql`DROP INDEX IF EXISTS idx_page_embeddings_fts`.execute(db);
await sql`
ALTER TABLE page_embeddings DROP COLUMN IF EXISTS fts
`.execute(db);
}

View File

@@ -48,6 +48,16 @@ export interface PageEmbeddingSearchHit {
distance: number;
}
/** A single hybrid (RRF-fused) search hit. Higher `score` is more relevant. */
export interface PageEmbeddingHybridHit {
pageId: string;
spaceId: string;
title: string | null;
content: string;
// Fused Reciprocal Rank Fusion score (sum of 1/(k+rank) across CTEs).
score: number;
}
@Injectable()
export class PageEmbeddingRepo {
constructor(@InjectKysely() private readonly db: KyselyDB) {}
@@ -173,6 +183,102 @@ export class PageEmbeddingRepo {
}));
}
/**
* HYBRID retrieval: fuse semantic (cosine) and lexical (full-text) chunk
* rankings with Reciprocal Rank Fusion (RRF). Scoped to a workspace AND the
* set of spaces the caller may read. Returns [] when `spaceIds` is empty.
*
* Two CTEs each rank chunks independently, then a FULL OUTER JOIN on the chunk
* `id` fuses them. RRF combines RANKS (not raw scores), so the cosine-distance
* and ts_rank scales never need normalizing — that is the whole point of RRF.
*
* score = 1/(k + rank_semantic) + 1/(k + rank_lexical)
*
* with k = 60 (Cormack et al. 2009; the default in Elasticsearch, OpenSearch
* and Weaviate) and equal 1.0/1.0 weights as a starting point. `candidates`
* is both the per-CTE over-fetch limit and the final fused LIMIT.
*
* The `model_dimensions = $dim` filter applies ONLY on the semantic side
* (cosine compares same-dimension vectors; pgvector errors otherwise). The
* lexical side (`fts`) is dimension-independent. If `websearch_to_tsquery`
* yields an EMPTY query (e.g. the text is all stopwords) the `@@` matches
* nothing and the lexical CTE is empty, so results degrade to pure-semantic —
* which is correct behaviour, not an error.
*
* `fts` is a generated column accessed only here via raw SQL (deliberately not
* in the Kysely `PageEmbeddings` type — see migration 20260618T150000).
*/
async hybridSearch(
workspaceId: string,
queryEmbedding: number[],
queryText: string,
spaceIds: string[],
// Per-CTE over-fetch AND the final fused LIMIT.
candidates: number,
): Promise<PageEmbeddingHybridHit[]> {
if (spaceIds.length === 0) return [];
const queryVector = sql`${pgvector.toSql(queryEmbedding)}::vector`;
const queryDim = queryEmbedding.length;
const spaceList = sql.join(
spaceIds.map((s) => sql`${s}`),
sql`, `,
);
const result = await sql<{
pageId: string;
spaceId: string;
title: string | null;
content: string;
score: number;
}>`
WITH semantic AS (
SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
row_number() OVER (ORDER BY pe.embedding <=> ${queryVector}) AS rank_ix
FROM page_embeddings pe
JOIN pages p ON p.id = pe.page_id
WHERE pe.workspace_id = ${workspaceId}
AND pe.space_id IN (${spaceList})
AND pe.model_dimensions = ${queryDim}
AND p.deleted_at IS NULL
ORDER BY pe.embedding <=> ${queryVector}
LIMIT ${candidates}
),
full_text AS (
SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
row_number() OVER (ORDER BY ts_rank(pe.fts, q.query) DESC) AS rank_ix
FROM page_embeddings pe
JOIN pages p ON p.id = pe.page_id,
websearch_to_tsquery('english', f_unaccent(${queryText})) AS q(query)
WHERE pe.workspace_id = ${workspaceId}
AND pe.space_id IN (${spaceList})
AND p.deleted_at IS NULL
AND pe.fts @@ q.query
ORDER BY ts_rank(pe.fts, q.query) DESC
LIMIT ${candidates}
)
SELECT
coalesce(semantic.page_id, full_text.page_id) AS "pageId",
coalesce(semantic.space_id, full_text.space_id) AS "spaceId",
coalesce(semantic.title, full_text.title) AS title,
coalesce(semantic.content, full_text.content) AS content,
coalesce(1.0/(60 + semantic.rank_ix), 0.0) * 1.0
+ coalesce(1.0/(60 + full_text.rank_ix), 0.0) * 1.0 AS score
FROM semantic
FULL OUTER JOIN full_text ON semantic.id = full_text.id
ORDER BY score DESC
LIMIT ${candidates}
`.execute(this.db);
return result.rows.map((row) => ({
pageId: row.pageId,
spaceId: row.spaceId,
title: row.title,
content: row.content,
score: Number(row.score),
}));
}
/**
* Count DISTINCT non-deleted pages that have at least one embedding row in this
* workspace — i.e. how many pages currently have stored embeddings.