feat(ai): hybrid RRF retrieval, heading-breadcrumb chunks, merged search tool
Improve agent RAG quality with three changes, plus a roadmap doc for the rest.
- Indexer: prefix each chunk with its heading path ("Page > H1 > H2"), built by
walking the ProseMirror JSON (heading nodes) so a `#` inside a fenced code block
is never mistaken for a heading. Falls back to plain-text chunking on any error.
buildChunkRows: replace indexOf-against-source offsets (breadcrumb prefixes break
verbatim matching) with a cumulative cursor — offsets are provenance-only.
- Hybrid search: new migration adds a generated `fts` tsvector column + GIN index
to page_embeddings (same english+f_unaccent config as pages.tsv). New
PageEmbeddingRepo.hybridSearch fuses cosine + full-text rankings via Reciprocal
Rank Fusion (k=60, equal weights) in one SQL query at chunk granularity.
- Tools: collapse semanticSearch + searchPages into one hybrid `searchPages` tool
with a query-rewrite-oriented description; gracefully falls back to the REST
full-text path when embeddings are unconfigured. Access control (space scope +
page-permission post-filter) preserved. Add a query-rewrite hint to the default
system prompt.
- docs/rag-improvements-plan.md: record what shipped and the deferred backlog
(reranker, attachment indexing, eval harness, tuning).
Note: requires a corpus reindex to populate breadcrumbs on existing pages.
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
import { type Kysely, sql } from 'kysely';
|
||||
|
||||
/**
|
||||
* Chunk-level lexical index for HYBRID retrieval (RRF) over `page_embeddings`.
|
||||
*
|
||||
* The agent's retrieval used to be either pure full-text (loopback REST over
|
||||
* `pages.tsv`) OR pure vector (cosine over `page_embeddings.embedding`). Hybrid
|
||||
* retrieval fuses BOTH rankings with Reciprocal Rank Fusion so exact keyword /
|
||||
* identifier matches AND semantic matches both surface. The vector side already
|
||||
* exists; this migration adds the missing LEXICAL side AT CHUNK GRANULARITY so
|
||||
* both CTEs of the fused query rank the SAME chunk rows.
|
||||
*
|
||||
* `fts` is a GENERATED ALWAYS ... STORED `tsvector` built from `content` with
|
||||
* the SAME text-search config as `pages.tsv`: `to_tsvector('english',
|
||||
* f_unaccent(content))`. Using the identical config keeps lexical behaviour
|
||||
* consistent with the existing page search (incl. unaccented Cyrillic content).
|
||||
* `f_unaccent(text)` is declared IMMUTABLE (migration 20250729T213756), which is
|
||||
* exactly what a GENERATED STORED column requires — so this needs NO trigger.
|
||||
* The column is independent of the embedding vector dimension: it indexes text,
|
||||
* not the vector, so it works for any model dimension.
|
||||
*
|
||||
* NOTE: `fts` is deliberately NOT added to the `PageEmbeddings` Kysely type. It
|
||||
* is a generated column accessed ONLY via raw SQL (the hybrid query); adding it
|
||||
* to the Kysely type would force it into the explicit-column insert in
|
||||
* `insertChunks` and break inserts (a GENERATED column cannot be written to).
|
||||
*/
|
||||
/**
 * Applies the migration: adds the generated `fts` column to `page_embeddings`,
 * then builds a GIN index over it.
 *
 * Both statements are guarded with `IF NOT EXISTS`. They are executed strictly
 * in order because the index is defined on the column the first one creates.
 *
 * @param db - Kysely handle the statements are executed against.
 */
export async function up(db: Kysely<any>): Promise<void> {
  const statements = [
    // Stored generated tsvector over `content`, using the same
    // english + f_unaccent expression as pages.tsv. Being a generated column,
    // it needs no trigger to stay in sync.
    sql`
      ALTER TABLE page_embeddings
      ADD COLUMN IF NOT EXISTS fts tsvector
      GENERATED ALWAYS AS (to_tsvector('english', f_unaccent(content))) STORED
    `,
    // GIN index backing `fts @@ query` lookups on the chunk text.
    sql`
      CREATE INDEX IF NOT EXISTS idx_page_embeddings_fts
      ON page_embeddings USING gin(fts)
    `,
  ];

  for (const statement of statements) {
    await statement.execute(db);
  }
}
|
||||
|
||||
/**
 * Reverts the migration: removes the GIN index first, then the generated `fts`
 * column it was built on. Both statements use `IF EXISTS`.
 *
 * @param db - Kysely handle the statements are executed against.
 */
export async function down(db: Kysely<any>): Promise<void> {
  const dropFtsIndex = sql`DROP INDEX IF EXISTS idx_page_embeddings_fts`;
  const dropFtsColumn = sql`
    ALTER TABLE page_embeddings DROP COLUMN IF EXISTS fts
  `;

  await dropFtsIndex.execute(db);
  await dropFtsColumn.execute(db);
}
|
||||
@@ -48,6 +48,16 @@ export interface PageEmbeddingSearchHit {
|
||||
distance: number;
|
||||
}
|
||||
|
||||
/**
 * One chunk returned by hybrid (RRF-fused) search. A higher `score` means the
 * chunk is more relevant.
 */
export interface PageEmbeddingHybridHit {
  /** Id of the page the chunk belongs to. */
  pageId: string;
  /** Id of the space the chunk belongs to. */
  spaceId: string;
  /** Title of the owning page; may be `null`. */
  title: string | null;
  /** Text of the matched chunk. */
  content: string;
  /**
   * Fused Reciprocal Rank Fusion score: the sum of 1/(k + rank) over every
   * ranking (CTE) in which the chunk appeared.
   */
  score: number;
}
|
||||
|
||||
@Injectable()
|
||||
export class PageEmbeddingRepo {
|
||||
constructor(@InjectKysely() private readonly db: KyselyDB) {}
|
||||
@@ -173,6 +183,102 @@ export class PageEmbeddingRepo {
|
||||
}));
|
||||
}
|
||||
|
||||
/**
 * HYBRID retrieval: fuse semantic (cosine) and lexical (full-text) chunk
 * rankings with Reciprocal Rank Fusion (RRF). Scoped to a workspace AND the
 * set of spaces the caller may read.
 *
 * Two CTEs each rank chunks independently, then a FULL OUTER JOIN on the chunk
 * `id` fuses them. RRF combines RANKS (not raw scores), so the cosine-distance
 * and ts_rank scales never need normalizing — that is the whole point of RRF.
 *
 *   score = 1/(k + rank_semantic) + 1/(k + rank_lexical)
 *
 * with k = 60 (Cormack et al. 2009; the default in Elasticsearch, OpenSearch
 * and Weaviate) and equal 1.0/1.0 weights as a starting point. A chunk that
 * appears in only one ranking contributes 0 for the missing side.
 *
 * The `model_dimensions = $dim` filter applies ONLY on the semantic side
 * (cosine compares same-dimension vectors; pgvector errors otherwise). The
 * lexical side (`fts`) is dimension-independent. If `websearch_to_tsquery`
 * yields an EMPTY query (e.g. the text is all stopwords) the `@@` matches
 * nothing and the lexical CTE is empty, so results degrade to pure-semantic —
 * which is correct behaviour, not an error.
 *
 * `fts` is a generated column accessed only here via raw SQL (deliberately not
 * in the Kysely `PageEmbeddings` type — see migration 20260618T150000).
 *
 * @param workspaceId - Workspace whose chunks are searched.
 * @param queryEmbedding - Query vector; its length selects which stored
 *   embeddings (by `model_dimensions`) take part in the semantic ranking.
 * @param queryText - Raw query text for the lexical ranking.
 * @param spaceIds - Spaces the caller may read; only chunks in these spaces
 *   are considered.
 * @param candidates - Per-CTE over-fetch limit AND the final fused LIMIT.
 *   Fractional values are floored.
 * @returns Fused hits ordered by descending `score`, at most `candidates`
 *   rows. Returns `[]` without querying when `spaceIds` is empty or when
 *   `candidates` is not a positive, finite number.
 */
async hybridSearch(
  workspaceId: string,
  queryEmbedding: number[],
  queryText: string,
  spaceIds: string[],
  // Per-CTE over-fetch AND the final fused LIMIT.
  candidates: number,
): Promise<PageEmbeddingHybridHit[]> {
  if (spaceIds.length === 0) return [];

  // `candidates` is bound into three LIMIT clauses. Zero can never yield rows,
  // and a negative / NaN / infinite value would otherwise only surface as a
  // database error, so settle all of those here without a round trip.
  const limit = Math.floor(candidates);
  if (!Number.isSafeInteger(limit) || limit <= 0) return [];

  const queryVector = sql`${pgvector.toSql(queryEmbedding)}::vector`;
  const queryDim = queryEmbedding.length;
  // One bound parameter per space id, comma-joined for the IN (...) lists.
  const spaceList = sql.join(
    spaceIds.map((s) => sql`${s}`),
    sql`, `,
  );

  const result = await sql<{
    pageId: string;
    spaceId: string;
    title: string | null;
    content: string;
    score: number;
  }>`
    WITH semantic AS (
      SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
             row_number() OVER (ORDER BY pe.embedding <=> ${queryVector}) AS rank_ix
      FROM page_embeddings pe
      JOIN pages p ON p.id = pe.page_id
      WHERE pe.workspace_id = ${workspaceId}
        AND pe.space_id IN (${spaceList})
        AND pe.model_dimensions = ${queryDim}
        AND p.deleted_at IS NULL
      ORDER BY pe.embedding <=> ${queryVector}
      LIMIT ${limit}
    ),
    full_text AS (
      SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
             row_number() OVER (ORDER BY ts_rank(pe.fts, q.query) DESC) AS rank_ix
      FROM page_embeddings pe
      JOIN pages p ON p.id = pe.page_id,
           websearch_to_tsquery('english', f_unaccent(${queryText})) AS q(query)
      WHERE pe.workspace_id = ${workspaceId}
        AND pe.space_id IN (${spaceList})
        AND p.deleted_at IS NULL
        AND pe.fts @@ q.query
      ORDER BY ts_rank(pe.fts, q.query) DESC
      LIMIT ${limit}
    )
    SELECT
      coalesce(semantic.page_id, full_text.page_id) AS "pageId",
      coalesce(semantic.space_id, full_text.space_id) AS "spaceId",
      coalesce(semantic.title, full_text.title) AS title,
      coalesce(semantic.content, full_text.content) AS content,
      coalesce(1.0/(60 + semantic.rank_ix), 0.0) * 1.0
        + coalesce(1.0/(60 + full_text.rank_ix), 0.0) * 1.0 AS score
    FROM semantic
    FULL OUTER JOIN full_text ON semantic.id = full_text.id
    ORDER BY score DESC
    LIMIT ${limit}
  `.execute(this.db);

  // NOTE(review): the score expression is a Postgres `numeric`; presumably the
  // driver may hand it back as a string, hence the explicit Number() — confirm.
  return result.rows.map(({ pageId, spaceId, title, content, score }) => ({
    pageId,
    spaceId,
    title,
    content,
    score: Number(score),
  }));
}
|
||||
|
||||
/**
|
||||
* Count DISTINCT non-deleted pages that have at least one embedding row in this
|
||||
* workspace — i.e. how many pages currently have stored embeddings.
|
||||
|
||||
Reference in New Issue
Block a user