feat(ai): hybrid RRF retrieval, heading-breadcrumb chunks, merged search tool

Improve agent RAG quality with three changes, plus a roadmap doc for the rest. - Indexer: prefix each chunk with its heading path ("Page > H1 > H2"), built by walking the ProseMirror JSON (heading nodes) so a `#` inside a fenced code block is never mistaken for a heading. Falls back to plain-text chunking on any error. buildChunkRows: drop indexOf-against-source offsets (breadcrumb prefixes break verbatim matching) for a cumulative cursor — offsets are provenance-only. - Hybrid search: new migration adds a generated `fts` tsvector column + GIN index to page_embeddings (same english+f_unaccent config as pages.tsv). New PageEmbeddingRepo.hybridSearch fuses cosine + full-text rankings via Reciprocal Rank Fusion (k=60, equal weights) in one SQL query at chunk granularity. - Tools: collapse semanticSearch + searchPages into one hybrid `searchPages` tool with a query-rewrite-oriented description; gracefully falls back to the REST full-text path when embeddings are unconfigured. Access control (space scope + page-permission post-filter) preserved. Add a query-rewrite hint to the default system prompt. - docs/rag-improvements-plan.md: record what shipped and the deferred backlog (reranker, attachment indexing, eval harness, tuning). Note: requires a corpus reindex to populate breadcrumbs on existing pages.
2026-06-18 03:43:01 +03:00
parent 91a63f0b2c
commit c8e41e8916
6 changed files with 555 additions and 145 deletions
--- a/apps/server/src/database/repos/ai-chat/page-embedding.repo.ts
+++ b/apps/server/src/database/repos/ai-chat/page-embedding.repo.ts
@@ -48,6 +48,16 @@ export interface PageEmbeddingSearchHit {
  distance: number;
 }

+/** A single hybrid (RRF-fused) search hit. Higher `score` is more relevant. */
+export interface PageEmbeddingHybridHit {
+  pageId: string;
+  spaceId: string;
+  title: string | null;
+  content: string;
+  // Fused Reciprocal Rank Fusion score (sum of 1/(k+rank) across CTEs).
+  score: number;
+}
+
@Injectable()
 export class PageEmbeddingRepo {
  constructor(@InjectKysely() private readonly db: KyselyDB) {}
@@ -173,6 +183,102 @@ export class PageEmbeddingRepo {
    }));
  }

+  /**
+   * HYBRID retrieval: fuse semantic (cosine) and lexical (full-text) chunk
+   * rankings with Reciprocal Rank Fusion (RRF). Scoped to a workspace AND the
+   * set of spaces the caller may read. Returns [] when `spaceIds` is empty.
+   *
+   * Two CTEs each rank chunks independently, then a FULL OUTER JOIN on the chunk
+   * `id` fuses them. RRF combines RANKS (not raw scores), so the cosine-distance
+   * and ts_rank scales never need normalizing — that is the whole point of RRF.
+   *
+   *   score = 1/(k + rank_semantic) + 1/(k + rank_lexical)
+   *
+   * with k = 60 (Cormack et al. 2009; the default in Elasticsearch, OpenSearch
+   * and Weaviate) and equal 1.0/1.0 weights as a starting point. `candidates`
+   * is both the per-CTE over-fetch limit and the final fused LIMIT.
+   *
+   * The `model_dimensions = $dim` filter applies ONLY on the semantic side
+   * (cosine compares same-dimension vectors; pgvector errors otherwise). The
+   * lexical side (`fts`) is dimension-independent. If `websearch_to_tsquery`
+   * yields an EMPTY query (e.g. the text is all stopwords) the `@@` matches
+   * nothing and the lexical CTE is empty, so results degrade to pure-semantic —
+   * which is correct behaviour, not an error.
+   *
+   * `fts` is a generated column accessed only here via raw SQL (deliberately not
+   * in the Kysely `PageEmbeddings` type — see migration 20260618T150000).
+   */
+  async hybridSearch(
+    workspaceId: string,
+    queryEmbedding: number[],
+    queryText: string,
+    spaceIds: string[],
+    // Per-CTE over-fetch AND the final fused LIMIT.
+    candidates: number,
+  ): Promise<PageEmbeddingHybridHit[]> {
+    if (spaceIds.length === 0) return [];
+
+    const queryVector = sql`${pgvector.toSql(queryEmbedding)}::vector`;
+    const queryDim = queryEmbedding.length;
+    const spaceList = sql.join(
+      spaceIds.map((s) => sql`${s}`),
+      sql`, `,
+    );
+
+    const result = await sql<{
+      pageId: string;
+      spaceId: string;
+      title: string | null;
+      content: string;
+      score: number;
+    }>`
+      WITH semantic AS (
+        SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
+               row_number() OVER (ORDER BY pe.embedding <=> ${queryVector}) AS rank_ix
+        FROM page_embeddings pe
+        JOIN pages p ON p.id = pe.page_id
+        WHERE pe.workspace_id = ${workspaceId}
+          AND pe.space_id IN (${spaceList})
+          AND pe.model_dimensions = ${queryDim}
+          AND p.deleted_at IS NULL
+        ORDER BY pe.embedding <=> ${queryVector}
+        LIMIT ${candidates}
+      ),
+      full_text AS (
+        SELECT pe.id, pe.page_id, pe.space_id, pe.content, p.title,
+               row_number() OVER (ORDER BY ts_rank(pe.fts, q.query) DESC) AS rank_ix
+        FROM page_embeddings pe
+        JOIN pages p ON p.id = pe.page_id,
+             websearch_to_tsquery('english', f_unaccent(${queryText})) AS q(query)
+        WHERE pe.workspace_id = ${workspaceId}
+          AND pe.space_id IN (${spaceList})
+          AND p.deleted_at IS NULL
+          AND pe.fts @@ q.query
+        ORDER BY ts_rank(pe.fts, q.query) DESC
+        LIMIT ${candidates}
+      )
+      SELECT
+        coalesce(semantic.page_id, full_text.page_id)   AS "pageId",
+        coalesce(semantic.space_id, full_text.space_id) AS "spaceId",
+        coalesce(semantic.title, full_text.title)       AS title,
+        coalesce(semantic.content, full_text.content)   AS content,
+        coalesce(1.0/(60 + semantic.rank_ix), 0.0) * 1.0
+          + coalesce(1.0/(60 + full_text.rank_ix), 0.0) * 1.0 AS score
+      FROM semantic
+      FULL OUTER JOIN full_text ON semantic.id = full_text.id
+      ORDER BY score DESC
+      LIMIT ${candidates}
+    `.execute(this.db);
+
+    return result.rows.map((row) => ({
+      pageId: row.pageId,
+      spaceId: row.spaceId,
+      title: row.title,
+      content: row.content,
+      score: Number(row.score),
+    }));
+  }
+
  /**
   * Count DISTINCT non-deleted pages that have at least one embedding row in this
   * workspace — i.e. how many pages currently have stored embeddings.