fix(ai): stop RAG coverage bar sticking below 100% on empty pages

"Indexed N of M pages" stayed at e.g. "27 of 34" forever even after a
successful full reindex. The numerator counted pages that have embeddings
while the denominator counted ALL non-deleted pages, so empty / text-less
pages (which legitimately store zero embeddings) could never be reached.

Add PageRepo.countEmbeddablePages: counts non-deleted pages that have
non-empty textContent OR already have a stored embedding row, and use it as
the totalPages denominator in AiSettingsService.getMasked. The "has
embeddings" clause covers pages indexed from the content JSON (null
textContent) and guarantees indexedPages <= totalPages. No DB migration.
This commit is contained in:
vvzvlad
2026-06-18 03:33:38 +03:00
parent 4be849f8a1
commit 91a63f0b2c
2 changed files with 42 additions and 1 deletions

View File

@@ -195,6 +195,44 @@ export class PageRepo {
return Number(row?.count ?? 0);
}
/**
* Count non-deleted pages in a workspace that have EMBEDDABLE content — i.e.
* pages the RAG indexer can actually produce embeddings for. Used as the
* denominator of the "Indexed N of M pages" coverage indicator so empty /
* text-less pages (which legitimately store zero embeddings) don't keep the
* bar below 100% forever.
*
* A page qualifies if it has non-empty textContent OR already has stored
* embeddings. The second clause covers pages whose text the indexer extracted
* from the content JSON when textContent was null, and guarantees this total is
* always >= countIndexedPages (the indexed count can never exceed it).
*/
async countEmbeddablePages(workspaceId: string): Promise<number> {
const row = await this.db
.selectFrom('pages as p')
.where('p.workspaceId', '=', workspaceId)
.where('p.deletedAt', 'is', null)
.where((eb) =>
eb.or([
// Has extractable body text. The regex matches any non-whitespace
// character, mirroring the indexer's `text.trim().length === 0` check
// (raw SQL -> use the snake_case column name).
sql<boolean>`p.text_content ~ '[^[:space:]]'`,
// OR already has at least one (non-deleted) embedding row.
eb.exists(
eb
.selectFrom('pageEmbeddings as pe')
.select(sql`1`.as('one'))
.whereRef('pe.pageId', '=', 'p.id')
.where('pe.deletedAt', 'is', null),
),
]),
)
.select((eb) => eb.fn.countAll().as('count'))
.executeTakeFirst();
return Number(row?.count ?? 0);
}
/**
* IDs of all non-deleted pages in a workspace. Used by the RAG bulk reindex to
* (re)build embeddings for every existing page.

View File

@@ -160,9 +160,12 @@ export class AiSettingsService {
hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
}
// totalPages now counts only pages with embeddable content (non-empty text
// or already-stored embeddings), so empty/text-less pages don't keep the
// "Indexed N of M pages" bar below 100% forever.
const [indexedPages, totalPages] = await Promise.all([
this.pageEmbeddingRepo.countIndexedPages(workspaceId),
this.pageRepo.countByWorkspace(workspaceId),
this.pageRepo.countEmbeddablePages(workspaceId),
]);
return {