fix(ai): address reindex-progress review round 1 (PR #242)

F1: clear the "Reindex now" spinner once the poll cap fires. Gate the
reindexing part of the button's loading state on the active poll window
(reindexDeadline !== null) so a run that outlives the 120s cap no longer
leaves the button stuck-disabled with a stale `reindexing: true`; the
admin can restart.

F2: rewrite reindexWorkspace JSDoc to describe the EMBEDDABLE page set
(text OR existing embeddings), matching getEmbeddablePageIds /
countEmbeddablePages instead of the old "every non-deleted page".

F3: extract the shared embeddable-content predicate into a private
PageRepo.embeddablePredicate helper, called by both countEmbeddablePages
and getEmbeddablePageIds, removing the verbatim duplication. Behavior is
identical (lockstep int-spec stays green).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-06-28 23:39:20 +03:00
parent bf09eec4e1
commit 85b38d6946
3 changed files with 62 additions and 38 deletions

View File

@@ -1084,7 +1084,18 @@ export default function AiProviderSettings() {
// background job keeps running, so also stay loading while the
// server reports `reindexing` (this also blocks a redundant
// re-trigger mid-run; the server de-dupes regardless).
loading={reindexMutation.isPending || settings?.reindexing === true}
//
// Gate the `reindexing` part on the active poll window
// (reindexDeadline !== null): once the 120s poll cap fires it nulls
// reindexDeadline and stops refetching, so `settings.reindexing`
// can be a stale `true` from the last poll. Without this gate the
// spinner would stay stuck (and the button disabled) forever for a
// run that outlives the cap — clearing it here lets the admin
// restart.
loading={
reindexMutation.isPending ||
(reindexDeadline !== null && settings?.reindexing === true)
}
onClick={() =>
reindexMutation.mutate(undefined, {
// Begin bounded polling so the counter climbs as the async

View File

@@ -185,9 +185,17 @@ export class EmbeddingIndexerService {
}
/**
* (Re)build embeddings for EVERY non-deleted page in a workspace. Used by the
* bulk reindex (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled
* and by the manual "Reindex now" action).
* (Re)build embeddings for the EMBEDDABLE page set of a workspace — the same
* set countEmbeddablePages counts (via getEmbeddablePageIds): non-deleted pages
* that have non-empty textContent OR already have a stored embedding row, NOT
* every non-deleted page. Iterating this set keeps the live `total` equal to
* the steady-state denominator, so the progress counter climbs 0 -> total and
* matches the before/after DB coverage exactly. Text-less pages are correctly
* skipped (reindexPage no-ops on them); a page that lost its text but still has
* stale embeddings stays in the set (the EXISTS clause) so it is visited and
* its stale rows are cleared. Used by the bulk reindex
* (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled and by the
* manual "Reindex now" action).
*
* Resolves the embeddings model once up front: if the workspace has no
* embeddings provider configured, the whole batch is skipped (otherwise each

View File

@@ -12,6 +12,7 @@ import { executeWithCursorPagination } from '@docmost/db/pagination/cursor-pagin
import { validate as isValidUUID } from 'uuid';
import { ExpressionBuilder, sql } from 'kysely';
import { DB } from '@docmost/db/types/db';
import { DbInterface } from '@docmost/db/types/db.interface';
import { jsonArrayFrom, jsonObjectFrom } from 'kysely/helpers/postgres';
import { SpaceMemberRepo } from '@docmost/db/repos/space/space-member.repo';
import { EventEmitter2 } from '@nestjs/event-emitter';
@@ -243,27 +244,43 @@ export class PageRepo {
.selectFrom('pages as p')
.where('p.workspaceId', '=', workspaceId)
.where('p.deletedAt', 'is', null)
.where((eb) =>
eb.or([
// Has extractable body text. The regex matches any non-whitespace
// character, mirroring the indexer's `text.trim().length === 0` check
// (raw SQL -> use the snake_case column name).
sql<boolean>`p.text_content ~ '[^[:space:]]'`,
// OR already has at least one (non-deleted) embedding row.
eb.exists(
eb
.selectFrom('pageEmbeddings as pe')
.select(sql`1`.as('one'))
.whereRef('pe.pageId', '=', 'p.id')
.where('pe.deletedAt', 'is', null),
),
]),
)
.where((eb) => this.embeddablePredicate(eb))
.select((eb) => eb.fn.countAll().as('count'))
.executeTakeFirst();
return Number(row?.count ?? 0);
}
/**
* The "embeddable content" qualifying predicate, shared verbatim by
* countEmbeddablePages (the steady-state denominator) and getEmbeddablePageIds
* (the set the bulk reindex iterates). Both MUST use the exact same condition
* or the live total and steady-state total diverge — extracting it here is what
* guarantees that, replacing the previous hand-duplicated copy. Callers supply
* the trivial workspaceId/deletedAt filters inline; this returns only the
* non-trivial OR clause, evaluated against the `p` alias of `pages`.
*
* A page qualifies if it has non-empty textContent OR already has a stored
* (non-deleted) embedding row.
*/
private embeddablePredicate(
eb: ExpressionBuilder<DbInterface & { p: DbInterface['pages'] }, 'p'>,
) {
return eb.or([
// Has extractable body text. The regex matches any non-whitespace
// character, mirroring the indexer's `text.trim().length === 0` check
// (raw SQL -> use the snake_case column name).
sql<boolean>`p.text_content ~ '[^[:space:]]'`,
// OR already has at least one (non-deleted) embedding row.
eb.exists(
eb
.selectFrom('pageEmbeddings as pe')
.select(sql`1`.as('one'))
.whereRef('pe.pageId', '=', 'p.id')
.where('pe.deletedAt', 'is', null),
),
]);
}
/**
* IDs of the EMBEDDABLE page set for a workspace — the exact same set that
* `countEmbeddablePages` counts (a page qualifies if it has non-empty
@@ -273,9 +290,11 @@ export class PageRepo {
* every non-deleted page (which would push the denominator above the
* steady-state value mid-run).
*
* IMPORTANT: the WHERE here MUST stay in lockstep with `countEmbeddablePages`
* — if one changes, change both, or the live total and steady-state total
* diverge again. Dropping text-less pages is correct: `reindexPage` no-ops on
* IMPORTANT: the qualifying WHERE is shared with `countEmbeddablePages` via the
* private `embeddablePredicate` helper, so the two can no longer drift — if the
* embeddable definition changes, change it once there and both stay in lockstep
* (else the live total and steady-state total diverge again). Dropping
* text-less pages is correct: `reindexPage` no-ops on
* a page with no extractable content anyway, and a page that lost its text but
* still has stale embeddings IS in this set (the EXISTS clause), so it is still
* visited and its stale rows are cleared.
@@ -286,21 +305,7 @@ export class PageRepo {
.select('p.id')
.where('p.workspaceId', '=', workspaceId)
.where('p.deletedAt', 'is', null)
.where((eb) =>
eb.or([
// Has extractable body text (mirrors countEmbeddablePages: any
// non-whitespace char; raw SQL -> snake_case column name).
sql<boolean>`p.text_content ~ '[^[:space:]]'`,
// OR already has at least one (non-deleted) embedding row.
eb.exists(
eb
.selectFrom('pageEmbeddings as pe')
.select(sql`1`.as('one'))
.whereRef('pe.pageId', '=', 'p.id')
.where('pe.deletedAt', 'is', null),
),
]),
)
.where((eb) => this.embeddablePredicate(eb))
.execute();
return rows.map((r) => r.id);
}