From 85b38d6946d9d6dbca42a9b120c33d78d25e5b4c Mon Sep 17 00:00:00 2001 From: claude code agent 227 Date: Sun, 28 Jun 2026 23:39:20 +0300 Subject: [PATCH] fix(ai): address reindex-progress review round 1 (PR #242) F1: clear the "Reindex now" spinner once the poll cap fires. Gate the reindexing part of the button's loading state on the active poll window (reindexDeadline !== null) so a run that outlives the 120s cap no longer leaves the button stuck-disabled with a stale `reindexing: true`; the admin can restart. F2: rewrite reindexWorkspace JSDoc to describe the EMBEDDABLE page set (text OR existing embeddings), matching getEmbeddablePageIds / countEmbeddablePages instead of the old "every non-deleted page". F3: extract the shared embeddable-content predicate into a private PageRepo.embeddablePredicate helper, called by both countEmbeddablePages and getEmbeddablePageIds, removing the verbatim duplication. Behavior is identical (lockstep int-spec stays green). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/ai-provider-settings.tsx | 13 +++- .../embedding/embedding-indexer.service.ts | 14 +++- .../src/database/repos/page/page.repo.ts | 73 ++++++++++--------- 3 files changed, 62 insertions(+), 38 deletions(-) diff --git a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx index a06d1e0f..832f8436 100644 --- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx +++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx @@ -1084,7 +1084,18 @@ export default function AiProviderSettings() { // background job keeps running, so also stay loading while the // server reports `reindexing` (this also blocks a redundant // re-trigger mid-run; the server de-dupes regardless). - loading={reindexMutation.isPending || settings?.reindexing === true} + // + // Gate the `reindexing` part on the active poll window + // (reindexDeadline !== null): once the 120s poll cap fires it nulls + // reindexDeadline and stops refetching, so `settings.reindexing` + // can be a stale `true` from the last poll. Without this gate the + // spinner would stay stuck (and the button disabled) forever for a + // run that outlives the cap — clearing it here lets the admin + // restart. + loading={ + reindexMutation.isPending || + (reindexDeadline !== null && settings?.reindexing === true) + } onClick={() => reindexMutation.mutate(undefined, { // Begin bounded polling so the counter climbs as the async diff --git a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts index 9c97a971..243da2ea 100644 --- a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts +++ b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts @@ -185,9 +185,17 @@ export class EmbeddingIndexerService { } /** - * (Re)build embeddings for EVERY non-deleted page in a workspace. Used by the - * bulk reindex (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled - * and by the manual "Reindex now" action). + * (Re)build embeddings for the EMBEDDABLE page set of a workspace — the same + * set countEmbeddablePages counts (via getEmbeddablePageIds): non-deleted pages + * that have non-empty textContent OR already have a stored embedding row, NOT + * every non-deleted page. Iterating this set keeps the live `total` equal to + * the steady-state denominator, so the progress counter climbs 0 -> total and + * matches the before/after DB coverage exactly. Text-less pages are correctly + * skipped (reindexPage no-ops on them); a page that lost its text but still has + * stale embeddings stays in the set (the EXISTS clause) so it is visited and + * its stale rows are cleared. Used by the bulk reindex + * (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled and by the + * manual "Reindex now" action). * * Resolves the embeddings model once up front: if the workspace has no * embeddings provider configured, the whole batch is skipped (otherwise each diff --git a/apps/server/src/database/repos/page/page.repo.ts b/apps/server/src/database/repos/page/page.repo.ts index a9b79c35..72a979ce 100644 --- a/apps/server/src/database/repos/page/page.repo.ts +++ b/apps/server/src/database/repos/page/page.repo.ts @@ -12,6 +12,7 @@ import { executeWithCursorPagination } from '@docmost/db/pagination/cursor-pagin import { validate as isValidUUID } from 'uuid'; import { ExpressionBuilder, sql } from 'kysely'; import { DB } from '@docmost/db/types/db'; +import { DbInterface } from '@docmost/db/types/db.interface'; import { jsonArrayFrom, jsonObjectFrom } from 'kysely/helpers/postgres'; import { SpaceMemberRepo } from '@docmost/db/repos/space/space-member.repo'; import { EventEmitter2 } from '@nestjs/event-emitter'; @@ -243,27 +244,43 @@ export class PageRepo { .selectFrom('pages as p') .where('p.workspaceId', '=', workspaceId) .where('p.deletedAt', 'is', null) - .where((eb) => - eb.or([ - // Has extractable body text. The regex matches any non-whitespace - // character, mirroring the indexer's `text.trim().length === 0` check - // (raw SQL -> use the snake_case column name). - sql`p.text_content ~ '[^[:space:]]'`, - // OR already has at least one (non-deleted) embedding row. - eb.exists( - eb - .selectFrom('pageEmbeddings as pe') - .select(sql`1`.as('one')) - .whereRef('pe.pageId', '=', 'p.id') - .where('pe.deletedAt', 'is', null), - ), - ]), - ) + .where((eb) => this.embeddablePredicate(eb)) .select((eb) => eb.fn.countAll().as('count')) .executeTakeFirst(); return Number(row?.count ?? 0); } + /** + * The "embeddable content" qualifying predicate, shared verbatim by + * countEmbeddablePages (the steady-state denominator) and getEmbeddablePageIds + * (the set the bulk reindex iterates). Both MUST use the exact same condition + * or the live total and steady-state total diverge — extracting it here is what + * guarantees that, replacing the previous hand-duplicated copy. Callers supply + * the trivial workspaceId/deletedAt filters inline; this returns only the + * non-trivial OR clause, evaluated against the `p` alias of `pages`. + * + * A page qualifies if it has non-empty textContent OR already has a stored + * (non-deleted) embedding row. + */ + private embeddablePredicate( + eb: ExpressionBuilder, + ) { + return eb.or([ + // Has extractable body text. The regex matches any non-whitespace + // character, mirroring the indexer's `text.trim().length === 0` check + // (raw SQL -> use the snake_case column name). + sql`p.text_content ~ '[^[:space:]]'`, + // OR already has at least one (non-deleted) embedding row. + eb.exists( + eb + .selectFrom('pageEmbeddings as pe') + .select(sql`1`.as('one')) + .whereRef('pe.pageId', '=', 'p.id') + .where('pe.deletedAt', 'is', null), + ), + ]); + } + /** * IDs of the EMBEDDABLE page set for a workspace — the exact same set that * `countEmbeddablePages` counts (a page qualifies if it has non-empty @@ -273,9 +290,11 @@ export class PageRepo { * every non-deleted page (which would push the denominator above the * steady-state value mid-run). * - * IMPORTANT: the WHERE here MUST stay in lockstep with `countEmbeddablePages` - * — if one changes, change both, or the live total and steady-state total - * diverge again. Dropping text-less pages is correct: `reindexPage` no-ops on + * IMPORTANT: the qualifying WHERE is shared with `countEmbeddablePages` via the + * private `embeddablePredicate` helper, so the two can no longer drift — if the + * embeddable definition changes, change it once there and both stay in lockstep + * (else the live total and steady-state total diverge again). Dropping + * text-less pages is correct: `reindexPage` no-ops on * a page with no extractable content anyway, and a page that lost its text but * still has stale embeddings IS in this set (the EXISTS clause), so it is still * visited and its stale rows are cleared. @@ -286,21 +305,7 @@ export class PageRepo { .select('p.id') .where('p.workspaceId', '=', workspaceId) .where('p.deletedAt', 'is', null) - .where((eb) => - eb.or([ - // Has extractable body text (mirrors countEmbeddablePages: any - // non-whitespace char; raw SQL -> snake_case column name). - sql`p.text_content ~ '[^[:space:]]'`, - // OR already has at least one (non-deleted) embedding row. - eb.exists( - eb - .selectFrom('pageEmbeddings as pe') - .select(sql`1`.as('one')) - .whereRef('pe.pageId', '=', 'p.id') - .where('pe.deletedAt', 'is', null), - ), - ]), - ) + .where((eb) => this.embeddablePredicate(eb)) .execute(); return rows.map((r) => r.id); }