From d0eae6908659bce1c7fda65c09c6c8b7b3289695 Mon Sep 17 00:00:00 2001 From: claude code agent 227 Date: Mon, 29 Jun 2026 16:12:36 +0300 Subject: [PATCH] fix(ai): raise reindex pre-seed TTL to the client poll cap; cover predicate clause; align docs (F11-F13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F11: PRE_SEED_TTL_SECONDS 45->120 (= client REINDEX_POLL_CAP_MS). At concurrency 1 a queued reindex can wait past the old 45s; if the pre-seed expired while pending, getMasked fell back to the COUNT and reported done, so the client stopped polling and missed the climb. Tie the pre-seed TTL to the client cap. F12: extend the lockstep integration spec — insertPage takes content; a text_content=null + text-node-content page is IN and a math-only page is OUT, pinning the structural "type":"text" clause (and the jsonb space-after-colon). F13: list all three embeddable clauses in the reindex JSDoc/inline comments. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../embedding/embedding-indexer.service.ts | 39 ++++++++------ .../ai/ai-settings.service.spec.ts | 6 ++- .../integrations/ai/ai-settings.service.ts | 38 +++++++++---- .../page-embeddable-ids-lockstep.int-spec.ts | 53 +++++++++++++++---- 4 files changed, 99 insertions(+), 37 deletions(-) diff --git a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts index 243da2ea..9d1a3a09 100644 --- a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts +++ b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts @@ -187,15 +187,19 @@ export class EmbeddingIndexerService { /** * (Re)build embeddings for the EMBEDDABLE page set of a workspace — the same * set countEmbeddablePages counts (via getEmbeddablePageIds): non-deleted pages - * that have non-empty textContent OR already have a stored embedding row, NOT - * every non-deleted page. Iterating this set keeps the live `total` equal to - * the steady-state denominator, so the progress counter climbs 0 -> total and - * matches the before/after DB coverage exactly. Text-less pages are correctly - * skipped (reindexPage no-ops on them); a page that lost its text but still has - * stale embeddings stays in the set (the EXISTS clause) so it is visited and - * its stale rows are cleared. Used by the bulk reindex - * (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled and by the - * manual "Reindex now" action). + * that qualify under any of the three clauses of `embeddablePredicate` — + * non-empty textContent, OR an empty/null textContent whose ProseMirror + * `content` JSON has at least one text node (`"type":"text"`) that `jsonToText` + * can extract, OR an already-stored (non-deleted) embedding row — NOT every + * non-deleted page. Iterating this set keeps the live `total` equal to the + * steady-state denominator, so the progress counter climbs 0 -> total and + * matches the before/after DB coverage exactly. A page with truly no + * extractable text (empty textContent AND content with only non-text/atom + * nodes such as math) is correctly skipped (reindexPage no-ops on it); a page + * that lost its text but still has stale embeddings stays in the set (the + * EXISTS clause) so it is visited and its stale rows are cleared. Used by the + * bulk reindex (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled + * and by the manual "Reindex now" action). * * Resolves the embeddings model once up front: if the workspace has no * embeddings provider configured, the whole batch is skipped (otherwise each @@ -223,13 +227,16 @@ export class EmbeddingIndexerService { throw err; } - // Iterate the EMBEDDABLE set (same predicate as countEmbeddablePages), NOT - // every non-deleted page: this makes `total` here equal the steady-state - // denominator, so the live counter climbs 0 -> total and matches the - // before/after DB count exactly (no 478 -> 500 -> 478 denominator jump). - // Text-less pages are correctly skipped — reindexPage no-ops on them, and - // a page that lost its text but still has stale embeddings IS in this set - // (the EXISTS clause) so it is still visited and its stale rows cleared. + // Iterate the EMBEDDABLE set (same three-clause predicate as + // countEmbeddablePages), NOT every non-deleted page: this makes `total` + // here equal the steady-state denominator, so the live counter climbs + // 0 -> total and matches the before/after DB count exactly (no + // 478 -> 500 -> 478 denominator jump). Pages whose text lives in the + // ProseMirror `content` JSON (a text node) even with empty text_content ARE + // in this set (the content-JSON clause) and get embedded; a page with no + // extractable text at all is correctly skipped — reindexPage no-ops on it — + // and a page that lost its text but still has stale embeddings IS in this + // set (the EXISTS clause) so it is still visited and its stale rows cleared. const pageIds = await this.pageRepo.getEmbeddablePageIds(workspaceId); const total = pageIds.length; const startedAt = Date.now(); diff --git a/apps/server/src/integrations/ai/ai-settings.service.spec.ts b/apps/server/src/integrations/ai/ai-settings.service.spec.ts index 557a0566..073b4737 100644 --- a/apps/server/src/integrations/ai/ai-settings.service.spec.ts +++ b/apps/server/src/integrations/ai/ai-settings.service.spec.ts @@ -182,7 +182,11 @@ describe('AiSettingsService.reindex progress seed', () => { ); const ttl = reindexProgress.start.mock.calls[0][2]; expect(ttl).toBeGreaterThan(0); - expect(ttl).toBeLessThanOrEqual(60); // short, not the full 1h record TTL + // Short pre-seed TTL, distinct from the full 1h (3600s) record TTL, but + // pinned to the client poll cap (120s) so a still-pending run can't expire + // into a false "done" while the client is still polling (F11). + expect(ttl).toBe(120); + expect(ttl).toBeLessThanOrEqual(120); expect(aiQueue.add).toHaveBeenCalledTimes(1); // Seed must precede the enqueue so the first poll already reports done=0. expect(order).toEqual(['start', 'add']); diff --git a/apps/server/src/integrations/ai/ai-settings.service.ts b/apps/server/src/integrations/ai/ai-settings.service.ts index f8525871..6be0b5be 100644 --- a/apps/server/src/integrations/ai/ai-settings.service.ts +++ b/apps/server/src/integrations/ai/ai-settings.service.ts @@ -33,14 +33,27 @@ export function parsePositiveInt(raw: unknown): number | undefined { /** * TTL (seconds) for the enqueue-time progress PRE-SEED written by `reindex()` - * before the worker starts. Deliberately SHORT: if `aiQueue.add()` de-duplicates - * against a job that is just finishing (the worker's finally already ran - * `clear()` but removeOnComplete hasn't yet removed the job), no new worker runs - * to overwrite/clear this seed — so a short TTL lets the phantom "reindexing: - * 0 of N" expire in seconds instead of sticking for the full 1h record TTL. A - * worker that DOES start re-seeds with the full TTL, so a real run is unaffected. + * before the worker starts. Deliberately SHORT relative to the full 1h record + * TTL: if `aiQueue.add()` de-duplicates against a job that is just finishing + * (the worker's finally already ran `clear()` but removeOnComplete hasn't yet + * removed the job), no new worker runs to overwrite/clear this seed — so this + * shorter TTL lets the phantom "reindexing: 0 of N" expire instead of sticking + * for the full 1h record TTL. A worker that DOES start re-seeds with the full + * TTL, so a real run is unaffected. + * + * It MUST be >= the client poll cap (REINDEX_POLL_CAP_MS = 120000ms in + * ai-provider-settings.tsx) though: the AI_QUEUE worker runs at concurrency 1 + * and shares the queue with page-level embedding jobs, so a queued reindex can + * wait well beyond a few dozen seconds before the worker re-seeds with the full + * TTL. If the pre-seed expired while the job is still pending, `get()` returns + * null and getMasked() falls back to the steady-state COUNT (indexedPages == + * totalPages, reindexing=false) — the client reads that as "done & fully + * indexed", clears its deadline and STOPS polling, so the admin never sees the + * real climb. Pinning the pre-seed TTL to the client cap means a deduped phantom + * is bounded to ~120s — the same window the client already polls — and a genuine + * pending run never expires-into-"done" inside that window. */ -const PRE_SEED_TTL_SECONDS = 45; +const PRE_SEED_TTL_SECONDS = 120; /** * Shape of the partial update accepted by `update`. Mirrors the validated @@ -128,10 +141,13 @@ export class AiSettingsService { let seeded = false; if ((await this.reindexProgress.get(workspaceId)) === null) { const totalPages = await this.pageRepo.countEmbeddablePages(workspaceId); - // Short TTL: if add() below de-duplicates against a just-finishing job - // whose worker already clear()ed but isn't removed yet, no worker runs to - // clear this seed — the short TTL expires the phantom record in seconds - // rather than leaving a stuck "reindexing: 0 of N" for the full record TTL. + // Short TTL (vs the full 1h record TTL): if add() below de-duplicates + // against a just-finishing job whose worker already clear()ed but isn't + // removed yet, no worker runs to clear this seed — the shorter TTL expires + // the phantom record rather than leaving a stuck "reindexing: 0 of N" for + // the full record TTL. It is kept >= the client poll cap (120s) so a + // genuine but still-pending run never expires into a false "done" while + // the client is still polling (see PRE_SEED_TTL_SECONDS). await this.reindexProgress.start( workspaceId, totalPages, diff --git a/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts b/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts index ae7ffde5..db16399c 100644 --- a/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts +++ b/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts @@ -42,10 +42,14 @@ describe('PageRepo embeddable-page set: getEmbeddablePageIds <-> countEmbeddable await destroyTestDb(); }); - // Insert a page with explicit text_content / deleted_at (createPage in db.ts - // sets neither), returning its id so the test can assert membership. + // Insert a page with explicit text_content / content / deleted_at (createPage + // in db.ts sets none), returning its id so the test can assert membership. + // `content` is the ProseMirror doc JSON (jsonb): postgres.js serializes a plain + // object to JSON for jsonb columns, so we pass it through only when supplied so + // the rest of the rows keep the DB default. async function insertPage(args: { textContent: string | null; + content?: unknown; deletedAt?: Date | null; }): Promise { const id = randomUUID(); @@ -58,6 +62,7 @@ describe('PageRepo embeddable-page set: getEmbeddablePageIds <-> countEmbeddable spaceId, workspaceId, textContent: args.textContent, + ...(args.content !== undefined ? { content: args.content as any } : {}), deletedAt: args.deletedAt ?? null, }) .execute(); @@ -96,29 +101,59 @@ describe('PageRepo embeddable-page set: getEmbeddablePageIds <-> countEmbeddable // so the reindex can clear them). const noTextLiveEmbedding = await insertPage({ textContent: null }); await insertEmbedding(noTextLiveEmbedding); + // (c) non-deleted page with EMPTY text_content but ProseMirror `content` JSON + // carrying a real text node — the content-JSON clause. This pins BOTH the + // third OR-clause AND the space-after-colon: jsonb stores the key/value + // separator as `"type": "text"` (a space after the colon), which is why + // the predicate needs `[[:space:]]*`. `reindexPage` extracts this text, so + // the page IS embeddable and the reindex MUST visit it. + const noTextContentDoc = await insertPage({ + textContent: null, + content: { + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'hello' }] }, + ], + }, + }); // OUT of the set ---------------------------------------------------------- - // (c) non-deleted, text_content NULL, no embeddings. + // (d) non-deleted, text_content NULL, no embeddings. await insertPage({ textContent: null }); - // (d) non-deleted, whitespace-only text (regex requires a non-space char). + // (e) non-deleted, whitespace-only text (regex requires a non-space char). await insertPage({ textContent: ' \n\t ' }); - // (e) deleted page WITH body text — excluded by the non-deleted predicate. + // (f) deleted page WITH body text — excluded by the non-deleted predicate. await insertPage({ textContent: 'deleted but had text', deletedAt: new Date(), }); - // (f) non-deleted, no text, with ONLY a DELETED embedding row — the EXISTS + // (g) non-deleted, no text, with ONLY a DELETED embedding row — the EXISTS // subquery filters pe.deleted_at IS NULL, so this stays out. const onlyDeletedEmbedding = await insertPage({ textContent: null }); await insertEmbedding(onlyDeletedEmbedding, { deletedAt: new Date() }); + // (h) non-deleted, empty text_content, content JSON with ONLY a math atom + // node — its LaTeX lives in `attrs.text` (a `"text":` KEY, not a + // `"type":"text"` text node) and has no text serializer, so `jsonToText` + // yields nothing and the page produces zero embeddings. The predicate + // keys on the structural `"type":"text"` marker, so this stays OUT (a + // bare `"text":` match would wrongly inflate the denominator). + await insertPage({ + textContent: null, + content: { + type: 'doc', + content: [{ type: 'mathBlock', attrs: { text: 'E=mc^2' } }], + }, + }); const ids = await repo.getEmbeddablePageIds(workspaceId); const count = await repo.countEmbeddablePages(workspaceId); // The two queries agree on the size (the load-bearing lockstep invariant)... expect(ids.length).toBe(count); - // ...and the set is exactly the two qualifying pages, nothing else. - expect(new Set(ids)).toEqual(new Set([withText, noTextLiveEmbedding])); - expect(count).toBe(2); + // ...and the set is exactly the three qualifying pages, nothing else. + expect(new Set(ids)).toEqual( + new Set([withText, noTextLiveEmbedding, noTextContentDoc]), + ); + expect(count).toBe(3); }); });