feat(ai): hybrid RRF retrieval, heading-breadcrumb chunks, merged search tool
Improve agent RAG quality with three changes, plus a roadmap doc for the rest.
- Indexer: prefix each chunk with its heading path ("Page > H1 > H2"), built by
walking the ProseMirror JSON (heading nodes) so a `#` inside a fenced code block
is never mistaken for a heading. Falls back to plain-text chunking on any error.
buildChunkRows: drop indexOf-against-source offsets (breadcrumb prefixes break
verbatim matching) for a cumulative cursor — offsets are provenance-only.
- Hybrid search: new migration adds a generated `fts` tsvector column + GIN index
to page_embeddings (same english+f_unaccent config as pages.tsv). New
PageEmbeddingRepo.hybridSearch fuses cosine + full-text rankings via Reciprocal
Rank Fusion (k=60, equal weights) in one SQL query at chunk granularity.
- Tools: collapse semanticSearch + searchPages into one hybrid `searchPages` tool
with a query-rewrite-oriented description; gracefully falls back to the REST
full-text path when embeddings are unconfigured. Access control (space scope +
page-permission post-filter) preserved. Add a query-rewrite hint to the default
system prompt.
- docs/rag-improvements-plan.md: record what shipped and the deferred backlog
(reranker, attachment indexing, eval harness, tuning).
Note: requires a corpus reindex to populate breadcrumbs on existing pages.
This commit is contained in:
@@ -9,6 +9,8 @@ const DEFAULT_PROMPT = [
|
||||
'You help the current user find, read, and reason about pages in their workspace.',
|
||||
'Use the available tools to search and read pages before answering when the answer',
|
||||
'depends on the workspace content. Cite the pages you used. Be concise and accurate.',
|
||||
"When searching, rephrase the user's question into focused keyword queries, and search",
|
||||
'again with different terms if the first results are weak.',
|
||||
].join(' ');
|
||||
|
||||
/**
|
||||
|
||||
@@ -77,9 +77,28 @@ export class EmbeddingIndexerService {
|
||||
return;
|
||||
}
|
||||
|
||||
const text = this.extractText(page);
|
||||
if (!text || text.trim().length === 0) {
|
||||
// Empty page -> remove any prior embeddings so search returns nothing.
|
||||
// Prefer heading-breadcrumb chunks: each chunk is prefixed with its heading
|
||||
// path ("Page Title > H1 > H2") so the breadcrumb is embedded AND stored in
|
||||
// `content` (feeding the fts column and the agent's snippet). Walk the
|
||||
// ProseMirror JSON — NOT the markdown text — so a `#` inside a fenced code
|
||||
// block is never mistaken for a heading. Degrades to the plain-text path on
|
||||
// any error / unknown structure (returns null).
|
||||
const breadcrumbChunks = page.content
|
||||
? await this.safeBuildBreadcrumbChunks(page.content, page.title)
|
||||
: null;
|
||||
|
||||
// Fall back to plain text when breadcrumb chunking is unavailable.
|
||||
const fallbackText =
|
||||
breadcrumbChunks && breadcrumbChunks.length > 0
|
||||
? null
|
||||
: this.extractText(page);
|
||||
|
||||
// Empty page (neither path produced content) -> remove any prior embeddings
|
||||
// so search returns nothing.
|
||||
if (
|
||||
(!breadcrumbChunks || breadcrumbChunks.length === 0) &&
|
||||
(!fallbackText || fallbackText.trim().length === 0)
|
||||
) {
|
||||
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
|
||||
return;
|
||||
}
|
||||
@@ -105,12 +124,17 @@ export class EmbeddingIndexerService {
|
||||
throw err;
|
||||
}
|
||||
|
||||
// Chunk the plain text.
|
||||
const splitter = new RecursiveCharacterTextSplitter({
|
||||
chunkSize: CHUNK_SIZE,
|
||||
chunkOverlap: CHUNK_OVERLAP,
|
||||
});
|
||||
const chunks = await splitter.splitText(text);
|
||||
// Use breadcrumb chunks when available; otherwise chunk the plain text.
|
||||
let chunks: string[];
|
||||
if (breadcrumbChunks && breadcrumbChunks.length > 0) {
|
||||
chunks = breadcrumbChunks;
|
||||
} else {
|
||||
const splitter = new RecursiveCharacterTextSplitter({
|
||||
chunkSize: CHUNK_SIZE,
|
||||
chunkOverlap: CHUNK_OVERLAP,
|
||||
});
|
||||
chunks = await splitter.splitText(fallbackText as string);
|
||||
}
|
||||
if (chunks.length === 0) {
|
||||
await this.pageEmbeddingRepo.deleteByPage(pageId, workspaceId);
|
||||
return;
|
||||
@@ -139,7 +163,6 @@ export class EmbeddingIndexerService {
|
||||
const rows = this.buildChunkRows(
|
||||
chunks,
|
||||
vectors,
|
||||
text,
|
||||
{ pageId, workspaceId, spaceId },
|
||||
modelName,
|
||||
);
|
||||
@@ -255,14 +278,16 @@ export class EmbeddingIndexerService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Map chunk strings + vectors to insertable rows, computing chunkStart /
|
||||
* chunkLength against the source text. A moving cursor handles repeated
|
||||
* substrings and overlap so offsets stay monotonic.
|
||||
* Map chunk strings + vectors to insertable rows. Breadcrumb-prefixed chunks
|
||||
* are NOT verbatim substrings of any source text, so chunkStart is a running
|
||||
* cumulative offset (sum of previous chunk lengths) rather than an indexOf
|
||||
* position. These offsets are informational provenance only — search returns
|
||||
* `content` and never slices by offset. chunkIndex stays a global monotonic
|
||||
* index.
|
||||
*/
|
||||
private buildChunkRows(
|
||||
chunks: string[],
|
||||
vectors: number[][],
|
||||
sourceText: string,
|
||||
ids: { pageId: string; workspaceId: string; spaceId: string },
|
||||
modelName: string,
|
||||
): PageEmbeddingChunkRow[] {
|
||||
@@ -272,11 +297,8 @@ export class EmbeddingIndexerService {
|
||||
const chunk = chunks[i];
|
||||
const embedding = vectors[i];
|
||||
if (!embedding) continue;
|
||||
const found = sourceText.indexOf(chunk, cursor);
|
||||
const chunkStart = found >= 0 ? found : cursor;
|
||||
// Advance the cursor past the start so later identical chunks resolve to
|
||||
// later occurrences (overlap keeps the next search valid).
|
||||
cursor = chunkStart + 1;
|
||||
const chunkStart = cursor;
|
||||
cursor += chunk.length;
|
||||
rows.push({
|
||||
pageId: ids.pageId,
|
||||
workspaceId: ids.workspaceId,
|
||||
@@ -295,4 +317,106 @@ export class EmbeddingIndexerService {
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
/**
|
||||
* Thin try/catch wrapper around buildBreadcrumbChunks. Any failure (malformed
|
||||
* structure, unknown node type, etc.) returns null so the caller degrades
|
||||
* gracefully to the plain-text chunking path.
|
||||
*/
|
||||
private async safeBuildBreadcrumbChunks(
|
||||
contentJson: unknown,
|
||||
pageTitle: string | null,
|
||||
): Promise<string[] | null> {
|
||||
try {
|
||||
return await this.buildBreadcrumbChunks(contentJson, pageTitle);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build heading-breadcrumb chunks by walking the ProseMirror JSON document.
|
||||
*
|
||||
* Each section (the body following a heading) is split with the same 1000/200
|
||||
* RecursiveCharacterTextSplitter, and every resulting piece is prefixed with
|
||||
* its heading path ("Page Title > H1 > H2"). Walking the JSON — not markdown
|
||||
* text — means a `#` inside a fenced code block is never treated as a heading
|
||||
* (ProseMirror heading nodes are explicit).
|
||||
*
|
||||
* Returns null when `contentJson` is not an object with an array `content`, so
|
||||
* the caller falls back to plain-text chunking.
|
||||
*/
|
||||
private async buildBreadcrumbChunks(
|
||||
contentJson: unknown,
|
||||
pageTitle: string | null,
|
||||
): Promise<string[] | null> {
|
||||
const doc = contentJson as { content?: unknown };
|
||||
if (
|
||||
typeof contentJson !== 'object' ||
|
||||
contentJson === null ||
|
||||
!Array.isArray(doc.content)
|
||||
) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const splitter = new RecursiveCharacterTextSplitter({
|
||||
chunkSize: CHUNK_SIZE,
|
||||
chunkOverlap: CHUNK_OVERLAP,
|
||||
});
|
||||
|
||||
const out: string[] = [];
|
||||
const stack: { level: number; text: string }[] = [];
|
||||
let buffer = '';
|
||||
|
||||
// Flush the accumulated body as one or more chunks under the CURRENT crumb.
|
||||
const flush = async (): Promise<void> => {
|
||||
if (buffer.trim().length === 0) {
|
||||
buffer = '';
|
||||
return;
|
||||
}
|
||||
const crumb = [pageTitle, ...stack.map((s) => s.text)]
|
||||
.filter((s) => typeof s === 'string' && s.trim().length > 0)
|
||||
.join(' > ');
|
||||
const pieces = await splitter.splitText(buffer);
|
||||
for (const piece of pieces) {
|
||||
out.push(crumb ? `${crumb}\n\n${piece}` : piece);
|
||||
}
|
||||
buffer = '';
|
||||
};
|
||||
|
||||
for (const block of doc.content as Array<{
|
||||
type?: string;
|
||||
attrs?: { level?: number };
|
||||
}>) {
|
||||
if (block?.type === 'heading') {
|
||||
// Flush the preceding body under the crumb in effect BEFORE this
|
||||
// heading, then update the heading stack.
|
||||
await flush();
|
||||
const level =
|
||||
typeof block.attrs?.level === 'number' ? block.attrs.level : 1;
|
||||
// Pop deeper-or-equal headings: a new H2 replaces a prior H2/H3/...
|
||||
while (stack.length > 0 && stack[stack.length - 1].level >= level) {
|
||||
stack.pop();
|
||||
}
|
||||
const headingText = jsonToText({
|
||||
type: 'doc',
|
||||
content: [block],
|
||||
} as never).trim();
|
||||
if (headingText.length > 0) {
|
||||
stack.push({ level, text: headingText });
|
||||
}
|
||||
} else {
|
||||
const blockText = jsonToText({
|
||||
type: 'doc',
|
||||
content: [block],
|
||||
} as never);
|
||||
buffer = buffer.length > 0 ? `${buffer}\n${blockText}` : blockText;
|
||||
}
|
||||
}
|
||||
|
||||
// Flush any trailing body after the last heading.
|
||||
await flush();
|
||||
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,37 +87,126 @@ export class AiChatToolsService {
|
||||
return {
|
||||
searchPages: tool({
|
||||
description:
|
||||
'Full-text search across the pages the current user can access. ' +
|
||||
'Returns a compact list of matching pages with a short snippet.',
|
||||
'Search the wiki for pages relevant to a query. Combines exact ' +
|
||||
'keyword/identifier matching with semantic meaning and returns the ' +
|
||||
'most relevant pages with a short snippet, best match first. ' +
|
||||
"Rephrase the user's question into a focused search query (key terms " +
|
||||
'and entities), not a full sentence. If the first results look weak ' +
|
||||
'or incomplete, search again with different wording or synonyms ' +
|
||||
'before answering.',
|
||||
inputSchema: z.object({
|
||||
query: z.string().describe('The search query.'),
|
||||
limit: z
|
||||
.number()
|
||||
.int()
|
||||
.min(1)
|
||||
.max(50)
|
||||
.max(20)
|
||||
.optional()
|
||||
.describe('Maximum number of results (1-50).'),
|
||||
.describe('Maximum number of results (1-20).'),
|
||||
}),
|
||||
execute: async ({ query, limit }) => {
|
||||
// search(query, spaceId?, limit?) -> { items, success }.
|
||||
// Items are filterSearchResult(): { id, title, highlight, ... }.
|
||||
const result = await client.search(query, undefined, limit);
|
||||
const items = Array.isArray(result?.items) ? result.items : [];
|
||||
// Keep the payload token-efficient: id + title + a short snippet only.
|
||||
return items.map((raw) => {
|
||||
const item = raw as {
|
||||
id?: string;
|
||||
slugId?: string;
|
||||
title?: string;
|
||||
highlight?: string;
|
||||
};
|
||||
return {
|
||||
id: item.id ?? item.slugId,
|
||||
title: item.title ?? '',
|
||||
snippet: snippet(item.highlight),
|
||||
};
|
||||
});
|
||||
const trimmed = (query ?? '').trim();
|
||||
if (!trimmed) return [];
|
||||
|
||||
const cap = limit ?? 10;
|
||||
|
||||
// Loopback REST full-text fallback. Used when AI search is not
|
||||
// configured, embedding fails, there are no accessible spaces, or the
|
||||
// hybrid query returns nothing — so keyword search always works.
|
||||
const fallback = async () => {
|
||||
// search(query, spaceId?, limit?) -> { items, success }.
|
||||
// Items are filterSearchResult(): { id, title, highlight, ... }.
|
||||
const result = await client.search(trimmed, undefined, cap);
|
||||
const items = Array.isArray(result?.items) ? result.items : [];
|
||||
// Keep the payload token-efficient: id + title + a short snippet.
|
||||
return items.map((raw) => {
|
||||
const item = raw as {
|
||||
id?: string;
|
||||
slugId?: string;
|
||||
title?: string;
|
||||
highlight?: string;
|
||||
};
|
||||
return {
|
||||
id: item.id ?? item.slugId,
|
||||
title: item.title ?? '',
|
||||
snippet: snippet(item.highlight),
|
||||
};
|
||||
});
|
||||
};
|
||||
|
||||
// HYBRID path: fuse semantic (vector) + lexical (full-text) rankings
|
||||
// via RRF. Over-fetch candidates so the page-permission post-filter
|
||||
// still leaves enough results.
|
||||
const candidates = Math.min(Math.max(cap * 5, 50), 200);
|
||||
|
||||
// 1) Embed the query. Unconfigured embeddings (or any embedding error)
|
||||
// routes to the REST full-text fallback instead of erroring.
|
||||
let queryVector: number[];
|
||||
try {
|
||||
const [vec] = await this.aiService.embedTexts(workspaceId, [
|
||||
trimmed,
|
||||
]);
|
||||
if (!vec) return await fallback();
|
||||
queryVector = vec;
|
||||
} catch (err) {
|
||||
if (!(err instanceof AiEmbeddingNotConfiguredException)) {
|
||||
// Never leak provider/key details; log generically and fall back.
|
||||
this.logger.warn(
|
||||
`searchPages embed failed: ${
|
||||
err instanceof Error ? err.message : 'unknown error'
|
||||
}`,
|
||||
);
|
||||
}
|
||||
return await fallback();
|
||||
}
|
||||
|
||||
// 2) ACCESS CONTROL: the hybrid query runs IN-PROCESS (a direct
|
||||
// pgvector + full-text query), so unlike the loopback REST tools it
|
||||
// does NOT get CASL for free. Scope to the spaces the user can read
|
||||
// (member spaces + groups), mirroring SearchService.searchPage. No
|
||||
// accessible spaces => fall back to REST (which is CASL-scoped).
|
||||
const accessibleSpaceIds =
|
||||
await this.spaceMemberRepo.getUserSpaceIds(user.id);
|
||||
if (accessibleSpaceIds.length === 0) return await fallback();
|
||||
|
||||
// 3) Hybrid RRF retrieval, scoped to the workspace AND accessible
|
||||
// spaces.
|
||||
const hits = await this.pageEmbeddingRepo.hybridSearch(
|
||||
workspaceId,
|
||||
queryVector,
|
||||
trimmed,
|
||||
accessibleSpaceIds,
|
||||
candidates,
|
||||
);
|
||||
if (hits.length === 0) return await fallback();
|
||||
|
||||
// 4) Page-level permission post-filter: an accessible space does not
|
||||
// imply every page in it is accessible (restricted pages). Mirror
|
||||
// SearchService.searchPage's filterAccessiblePageIds pass.
|
||||
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
|
||||
const accessibleIds =
|
||||
await this.pagePermissionRepo.filterAccessiblePageIds({
|
||||
pageIds,
|
||||
userId: user.id,
|
||||
});
|
||||
const accessibleSet = new Set(accessibleIds);
|
||||
|
||||
// Keep the best (first — hits are ordered by fused score desc) chunk
|
||||
// per page, capped to `cap`.
|
||||
const seen = new Set<string>();
|
||||
const results: { id: string; title: string; snippet: string }[] = [];
|
||||
for (const hit of hits) {
|
||||
if (!accessibleSet.has(hit.pageId)) continue;
|
||||
if (seen.has(hit.pageId)) continue;
|
||||
seen.add(hit.pageId);
|
||||
results.push({
|
||||
id: hit.pageId,
|
||||
title: hit.title ?? '',
|
||||
snippet: snippet(hit.content),
|
||||
});
|
||||
if (results.length >= cap) break;
|
||||
}
|
||||
return results;
|
||||
},
|
||||
}),
|
||||
|
||||
@@ -142,110 +231,6 @@ export class AiChatToolsService {
|
||||
},
|
||||
}),
|
||||
|
||||
semanticSearch: tool({
|
||||
description:
|
||||
'Semantic (vector) search across the pages the current user can ' +
|
||||
'access. Finds pages by meaning, not just keywords — use it to ' +
|
||||
'answer conceptual questions. Returns a compact list of relevant ' +
|
||||
'pages with a short snippet. Falls back to searchPages if semantic ' +
|
||||
'search is unavailable.',
|
||||
inputSchema: z.object({
|
||||
query: z.string().describe('The natural-language search query.'),
|
||||
limit: z
|
||||
.number()
|
||||
.int()
|
||||
.min(1)
|
||||
.max(20)
|
||||
.optional()
|
||||
.describe('Maximum number of results (1-20).'),
|
||||
}),
|
||||
execute: async ({ query, limit }) => {
|
||||
// ACCESS CONTROL: this tool runs IN-PROCESS (a direct pgvector query),
|
||||
// so unlike the loopback REST tools it does NOT get CASL for free. We
|
||||
// scope every query to the spaces the user can read, mirroring
|
||||
// SearchService.searchPage (§6.7 / §8). We additionally post-filter by
|
||||
// page-level permissions so restricted pages inside an accessible
|
||||
// space are never returned.
|
||||
const trimmed = (query ?? '').trim();
|
||||
if (trimmed.length === 0) return [];
|
||||
|
||||
// 1) Embed the query (no-op fallback when embeddings are unconfigured
|
||||
// so the agent can fall back to searchPages instead of erroring).
|
||||
let queryVector: number[];
|
||||
try {
|
||||
const [vec] = await this.aiService.embedTexts(workspaceId, [
|
||||
trimmed,
|
||||
]);
|
||||
if (!vec) return [];
|
||||
queryVector = vec;
|
||||
} catch (err) {
|
||||
if (err instanceof AiEmbeddingNotConfiguredException) {
|
||||
return {
|
||||
unavailable: true,
|
||||
reason:
|
||||
'semantic search unavailable (embeddings not configured)',
|
||||
};
|
||||
}
|
||||
// Never leak provider/key details; surface a generic unavailable.
|
||||
this.logger.warn(
|
||||
`semanticSearch embed failed: ${
|
||||
err instanceof Error ? err.message : 'unknown error'
|
||||
}`,
|
||||
);
|
||||
return {
|
||||
unavailable: true,
|
||||
reason: 'semantic search unavailable',
|
||||
};
|
||||
}
|
||||
|
||||
// 2) Resolve the spaces this user can read (member spaces + groups),
|
||||
// mirroring SearchService's space scoping. No spaces => no results.
|
||||
const accessibleSpaceIds =
|
||||
await this.spaceMemberRepo.getUserSpaceIds(user.id);
|
||||
if (accessibleSpaceIds.length === 0) return [];
|
||||
|
||||
// 3) Cosine ANN over the embeddings, scoped to the workspace AND the
|
||||
// accessible spaces. Over-fetch a little so the page-permission
|
||||
// post-filter still leaves enough results.
|
||||
const cap = limit ?? 10;
|
||||
const hits = await this.pageEmbeddingRepo.searchByEmbedding(
|
||||
workspaceId,
|
||||
queryVector,
|
||||
accessibleSpaceIds,
|
||||
cap * 3,
|
||||
);
|
||||
if (hits.length === 0) return [];
|
||||
|
||||
// 4) Page-level permission post-filter: a space being accessible does
|
||||
// not imply every page in it is (restricted pages). Mirror
|
||||
// SearchService.searchPage's filterAccessiblePageIds pass.
|
||||
const pageIds = Array.from(new Set(hits.map((h) => h.pageId)));
|
||||
const accessibleIds =
|
||||
await this.pagePermissionRepo.filterAccessiblePageIds({
|
||||
pageIds,
|
||||
userId: user.id,
|
||||
});
|
||||
const accessibleSet = new Set(accessibleIds);
|
||||
|
||||
// Keep the best (lowest-distance) hit per page, capped to `limit`.
|
||||
const seen = new Set<string>();
|
||||
const results: { pageId: string; title: string; snippet: string }[] =
|
||||
[];
|
||||
for (const hit of hits) {
|
||||
if (!accessibleSet.has(hit.pageId)) continue;
|
||||
if (seen.has(hit.pageId)) continue;
|
||||
seen.add(hit.pageId);
|
||||
results.push({
|
||||
pageId: hit.pageId,
|
||||
title: hit.title ?? '',
|
||||
snippet: snippet(hit.content),
|
||||
});
|
||||
if (results.length >= cap) break;
|
||||
}
|
||||
return results;
|
||||
},
|
||||
}),
|
||||
|
||||
// --- WRITE tools (all reversible — history/trash; §6.5 / D3) ---
|
||||
|
||||
createPage: tool({
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
import { type Kysely, sql } from 'kysely';
|
||||
|
||||
/**
|
||||
* Chunk-level lexical index for HYBRID retrieval (RRF) over `page_embeddings`.
|
||||
*
|
||||
* The agent's retrieval used to be either pure full-text (loopback REST over
|
||||
* `pages.tsv`) OR pure vector (cosine over `page_embeddings.embedding`). Hybrid
|
||||
* retrieval fuses BOTH rankings with Reciprocal Rank Fusion so exact keyword /
|
||||
* identifier matches AND semantic matches both surface. The vector side already
|
||||
* exists; this migration adds the missing LEXICAL side AT CHUNK GRANULARITY so
|
||||
* both CTEs of the fused query rank the SAME chunk rows.
|
||||
*
|
||||
* `fts` is a GENERATED ALWAYS ... STORED `tsvector` built from `content` with
|
||||
* the SAME text-search config as `pages.tsv`: `to_tsvector('english',
|
||||
* f_unaccent(content))`. Using the identical config keeps lexical behaviour
|
||||
* consistent with the existing page search (incl. unaccented Cyrillic content).
|
||||
* `f_unaccent(text)` is declared IMMUTABLE (migration 20250729T213756), which is
|
||||
* exactly what a GENERATED STORED column requires — so this needs NO trigger.
|
||||
* The column is independent of the embedding vector dimension: it indexes text,
|
||||
* not the vector, so it works for any model dimension.
|
||||
*
|
||||
* NOTE: `fts` is deliberately NOT added to the `PageEmbeddings` Kysely type. It
|
||||
* is a generated column accessed ONLY via raw SQL (the hybrid query); adding it
|
||||
* to the Kysely type would force it into the explicit-column insert in
|
||||
* `insertChunks` and break inserts (a GENERATED column cannot be written to).
|
||||
*/
|
||||
export async function up(db: Kysely<any>): Promise<void> {
|
||||
// Generated STORED tsvector mirroring pages.tsv's config. f_unaccent is
|
||||
// IMMUTABLE so it is valid inside a GENERATED column expression (no trigger).
|
||||
await sql`
|
||||
ALTER TABLE page_embeddings
|
||||
ADD COLUMN IF NOT EXISTS fts tsvector
|
||||
GENERATED ALWAYS AS (to_tsvector('english', f_unaccent(content))) STORED
|
||||
`.execute(db);
|
||||
|
||||
// GIN index for fast `fts @@ query` lexical matching on the chunk text.
|
||||
await sql`
|
||||
CREATE INDEX IF NOT EXISTS idx_page_embeddings_fts
|
||||
ON page_embeddings USING gin(fts)
|
||||
`.execute(db);
|
||||
}
|
||||
|
||||
export async function down(db: Kysely<any>): Promise<void> {
|
||||
await sql`DROP INDEX IF EXISTS idx_page_embeddings_fts`.execute(db);
|
||||
await sql`
|
||||
ALTER TABLE page_embeddings DROP COLUMN IF EXISTS fts
|
||||
`.execute(db);
|
||||
}
|
||||
@@ -48,6 +48,16 @@ export interface PageEmbeddingSearchHit {
|
||||
distance: number;
|
||||
}
|
||||
|
||||
/** A single hybrid (RRF-fused) search hit. Higher `score` is more relevant. */
|
||||
export interface PageEmbeddingHybridHit {
|
||||
pageId: string;
|
||||
spaceId: string;
|
||||
title: string | null;
|
||||
content: string;
|
||||
// Fused Reciprocal Rank Fusion score (sum of 1/(k+rank) across CTEs).
|
||||
score: number;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class PageEmbeddingRepo {
|
||||
constructor(@InjectKysely() private readonly db: KyselyDB) {}
|
||||
@@ -173,6 +183,102 @@ export class PageEmbeddingRepo {
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* HYBRID retrieval: fuse semantic (cosine) and lexical (full-text) chunk
|
||||
* rankings with Reciprocal Rank Fusion (RRF). Scoped to a workspace AND the
|
||||
* set of spaces the caller may read. Returns [] when `spaceIds` is empty.
|
||||
*
|
||||
* Two CTEs each rank chunks independently, then a FULL OUTER JOIN on the chunk
|
||||
* `id` fuses them. RRF combines RANKS (not raw scores), so the cosine-distance
|
||||
* and ts_rank scales never need normalizing — that is the whole point of RRF.
|
||||
*
|
||||
* score = 1/(k + rank_semantic) + 1/(k + rank_lexical)
|
||||
*
|
||||
* with k = 60 (Cormack et al. 2009; the default in Elasticsearch, OpenSearch
|
||||
* and Weaviate) and equal 1.0/1.0 weights as a starting point. `candidates`
|
||||
* is both the per-CTE over-fetch limit and the final fused LIMIT.
|
||||
*
|
||||
* The `model_dimensions = $dim` filter applies ONLY on the semantic side
|
||||
* (cosine compares same-dimension vectors; pgvector errors otherwise). The
|
||||
* lexical side (`fts`) is dimension-independent. If `websearch_to_tsquery`
|
||||
* yields an EMPTY query (e.g. the text is all stopwords) the `@@` matches
|
||||
* nothing and the lexical CTE is empty, so results degrade to pure-semantic —
|
||||
* which is correct behaviour, not an error.
|
||||
*
|
||||
* `fts` is a generated column accessed only here via raw SQL (deliberately not
|
||||
* in the Kysely `PageEmbeddings` type — see migration 20260618T150000).
|
||||
*/
|
||||
async hybridSearch(
|
||||
workspaceId: string,
|
||||
queryEmbedding: number[],
|
||||
queryText: string,
|
||||
spaceIds: string[],
|
||||
// Per-CTE over-fetch AND the final fused LIMIT.
|
||||
candidates: number,
|
||||
): Promise<PageEmbeddingHybridHit[]> {
|
||||
if (spaceIds.length === 0) return [];
|
||||
|
||||
const queryVector = sql`${pgvector.toSql(queryEmbedding)}::vector`;
|
||||
const queryDim = queryEmbedding.length;
|
||||
const spaceList = sql.join(
|
||||
spaceIds.map((s) => sql`${s}`),
|
||||
sql`, `,
|
||||
);
|
||||
|
||||
const result = await sql<{
|
||||
pageId: string;
|
||||
spaceId: string;
|
||||
title: string | null;
|
||||
content: string;
|
||||
score: number;
|
||||
}>`
|
||||
WITH semantic AS (
|
||||
SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
|
||||
row_number() OVER (ORDER BY pe.embedding <=> ${queryVector}) AS rank_ix
|
||||
FROM page_embeddings pe
|
||||
JOIN pages p ON p.id = pe.page_id
|
||||
WHERE pe.workspace_id = ${workspaceId}
|
||||
AND pe.space_id IN (${spaceList})
|
||||
AND pe.model_dimensions = ${queryDim}
|
||||
AND p.deleted_at IS NULL
|
||||
ORDER BY pe.embedding <=> ${queryVector}
|
||||
LIMIT ${candidates}
|
||||
),
|
||||
full_text AS (
|
||||
SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
|
||||
row_number() OVER (ORDER BY ts_rank(pe.fts, q.query) DESC) AS rank_ix
|
||||
FROM page_embeddings pe
|
||||
JOIN pages p ON p.id = pe.page_id,
|
||||
websearch_to_tsquery('english', f_unaccent(${queryText})) AS q(query)
|
||||
WHERE pe.workspace_id = ${workspaceId}
|
||||
AND pe.space_id IN (${spaceList})
|
||||
AND p.deleted_at IS NULL
|
||||
AND pe.fts @@ q.query
|
||||
ORDER BY ts_rank(pe.fts, q.query) DESC
|
||||
LIMIT ${candidates}
|
||||
)
|
||||
SELECT
|
||||
coalesce(semantic.page_id, full_text.page_id) AS "pageId",
|
||||
coalesce(semantic.space_id, full_text.space_id) AS "spaceId",
|
||||
coalesce(semantic.title, full_text.title) AS title,
|
||||
coalesce(semantic.content, full_text.content) AS content,
|
||||
coalesce(1.0/(60 + semantic.rank_ix), 0.0) * 1.0
|
||||
+ coalesce(1.0/(60 + full_text.rank_ix), 0.0) * 1.0 AS score
|
||||
FROM semantic
|
||||
FULL OUTER JOIN full_text ON semantic.id = full_text.id
|
||||
ORDER BY score DESC
|
||||
LIMIT ${candidates}
|
||||
`.execute(this.db);
|
||||
|
||||
return result.rows.map((row) => ({
|
||||
pageId: row.pageId,
|
||||
spaceId: row.spaceId,
|
||||
title: row.title,
|
||||
content: row.content,
|
||||
score: Number(row.score),
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Count DISTINCT non-deleted pages that have at least one embedding row in this
|
||||
* workspace — i.e. how many pages currently have stored embeddings.
|
||||
|
||||
Reference in New Issue
Block a user