From d0eae6908659bce1c7fda65c09c6c8b7b3289695 Mon Sep 17 00:00:00 2001
From: claude code agent 227 <claude_code@vvzvlad.xyz>
Date: Mon, 29 Jun 2026 16:12:36 +0300
Subject: [PATCH] fix(ai): raise reindex pre-seed TTL to the client poll cap;
 cover predicate clause; align docs (F11-F13)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

F11: PRE_SEED_TTL_SECONDS 45->120 (= client REINDEX_POLL_CAP_MS). At concurrency
     1 a queued reindex can wait past the old 45s; if the pre-seed expired while
     pending, getMasked fell back to the COUNT and reported done, so the client
     stopped polling and missed the climb. Tie the pre-seed TTL to the client cap.
F12: extend the lockstep integration spec — insertPage takes content; a
     text_content=null + text-node-content page is IN and a math-only page is OUT,
     pinning the structural "type":"text" clause (and the jsonb space-after-colon).
F13: list all three embeddable clauses in the reindex JSDoc/inline comments.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../embedding/embedding-indexer.service.ts    | 39 ++++++++------
 .../ai/ai-settings.service.spec.ts            |  6 ++-
 .../integrations/ai/ai-settings.service.ts    | 38 +++++++++----
 .../page-embeddable-ids-lockstep.int-spec.ts  | 53 +++++++++++++++----
 4 files changed, 99 insertions(+), 37 deletions(-)

diff --git a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts
index 243da2ea..9d1a3a09 100644
--- a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts
+++ b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts
@@ -187,15 +187,19 @@ export class EmbeddingIndexerService {
   /**
    * (Re)build embeddings for the EMBEDDABLE page set of a workspace — the same
    * set countEmbeddablePages counts (via getEmbeddablePageIds): non-deleted pages
-   * that have non-empty textContent OR already have a stored embedding row, NOT
-   * every non-deleted page. Iterating this set keeps the live `total` equal to
-   * the steady-state denominator, so the progress counter climbs 0 -> total and
-   * matches the before/after DB coverage exactly. Text-less pages are correctly
-   * skipped (reindexPage no-ops on them); a page that lost its text but still has
-   * stale embeddings stays in the set (the EXISTS clause) so it is visited and
-   * its stale rows are cleared. Used by the bulk reindex
-   * (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled and by the
-   * manual "Reindex now" action).
+   * that qualify under any of the three clauses of `embeddablePredicate` —
+   * non-empty textContent, OR an empty/null textContent whose ProseMirror
+   * `content` JSON has at least one text node (`"type":"text"`) that `jsonToText`
+   * can extract, OR an already-stored (non-deleted) embedding row — NOT every
+   * non-deleted page. Iterating this set keeps the live `total` equal to the
+   * steady-state denominator, so the progress counter climbs 0 -> total and
+   * matches the before/after DB coverage exactly. A page with truly no
+   * extractable text (empty textContent AND content with only non-text/atom
+   * nodes such as math) is correctly skipped (reindexPage no-ops on it); a page
+   * that lost its text but still has stale embeddings stays in the set (the
+   * EXISTS clause) so it is visited and its stale rows are cleared. Used by the
+   * bulk reindex (WORKSPACE_CREATE_EMBEDDINGS, fired when AI Search is enabled
+   * and by the manual "Reindex now" action).
    *
    * Resolves the embeddings model once up front: if the workspace has no
    * embeddings provider configured, the whole batch is skipped (otherwise each
@@ -223,13 +227,16 @@ export class EmbeddingIndexerService {
         throw err;
       }
 
-      // Iterate the EMBEDDABLE set (same predicate as countEmbeddablePages), NOT
-      // every non-deleted page: this makes `total` here equal the steady-state
-      // denominator, so the live counter climbs 0 -> total and matches the
-      // before/after DB count exactly (no 478 -> 500 -> 478 denominator jump).
-      // Text-less pages are correctly skipped — reindexPage no-ops on them, and
-      // a page that lost its text but still has stale embeddings IS in this set
-      // (the EXISTS clause) so it is still visited and its stale rows cleared.
+      // Iterate the EMBEDDABLE set (same three-clause predicate as
+      // countEmbeddablePages), NOT every non-deleted page: this makes `total`
+      // here equal the steady-state denominator, so the live counter climbs
+      // 0 -> total and matches the before/after DB count exactly (no
+      // 478 -> 500 -> 478 denominator jump). Pages whose text lives in the
+      // ProseMirror `content` JSON (a text node) even with empty text_content ARE
+      // in this set (the content-JSON clause) and get embedded; a page with no
+      // extractable text at all is correctly skipped — reindexPage no-ops on it —
+      // and a page that lost its text but still has stale embeddings IS in this
+      // set (the EXISTS clause) so it is still visited and its stale rows cleared.
       const pageIds = await this.pageRepo.getEmbeddablePageIds(workspaceId);
       const total = pageIds.length;
       const startedAt = Date.now();
diff --git a/apps/server/src/integrations/ai/ai-settings.service.spec.ts b/apps/server/src/integrations/ai/ai-settings.service.spec.ts
index 557a0566..073b4737 100644
--- a/apps/server/src/integrations/ai/ai-settings.service.spec.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.spec.ts
@@ -182,7 +182,11 @@ describe('AiSettingsService.reindex progress seed', () => {
     );
     const ttl = reindexProgress.start.mock.calls[0][2];
     expect(ttl).toBeGreaterThan(0);
-    expect(ttl).toBeLessThanOrEqual(60); // short, not the full 1h record TTL
+    // Short pre-seed TTL, distinct from the full 1h (3600s) record TTL, but
+    // pinned to the client poll cap (120s) so a still-pending run can't expire
+    // into a false "done" while the client is still polling (F11).
+    expect(ttl).toBe(120);
+    expect(ttl).toBeLessThanOrEqual(120);
     expect(aiQueue.add).toHaveBeenCalledTimes(1);
     // Seed must precede the enqueue so the first poll already reports done=0.
     expect(order).toEqual(['start', 'add']);
diff --git a/apps/server/src/integrations/ai/ai-settings.service.ts b/apps/server/src/integrations/ai/ai-settings.service.ts
index f8525871..6be0b5be 100644
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -33,14 +33,27 @@ export function parsePositiveInt(raw: unknown): number | undefined {
 
 /**
  * TTL (seconds) for the enqueue-time progress PRE-SEED written by `reindex()`
- * before the worker starts. Deliberately SHORT: if `aiQueue.add()` de-duplicates
- * against a job that is just finishing (the worker's finally already ran
- * `clear()` but removeOnComplete hasn't yet removed the job), no new worker runs
- * to overwrite/clear this seed — so a short TTL lets the phantom "reindexing:
- * 0 of N" expire in seconds instead of sticking for the full 1h record TTL. A
- * worker that DOES start re-seeds with the full TTL, so a real run is unaffected.
+ * before the worker starts. Deliberately SHORT relative to the full 1h record
+ * TTL: if `aiQueue.add()` de-duplicates against a job that is just finishing
+ * (the worker's finally already ran `clear()` but removeOnComplete hasn't yet
+ * removed the job), no new worker runs to overwrite/clear this seed — so this
+ * shorter TTL lets the phantom "reindexing: 0 of N" expire instead of sticking
+ * for the full 1h record TTL. A worker that DOES start re-seeds with the full
+ * TTL, so a real run is unaffected.
+ *
+ * It MUST be >= the client poll cap (REINDEX_POLL_CAP_MS = 120000ms in
+ * ai-provider-settings.tsx) though: the AI_QUEUE worker runs at concurrency 1
+ * and shares the queue with page-level embedding jobs, so a queued reindex can
+ * wait well beyond a few dozen seconds before the worker re-seeds with the full
+ * TTL. If the pre-seed expired while the job is still pending, `get()` returns
+ * null and getMasked() falls back to the steady-state COUNT (indexedPages ==
+ * totalPages, reindexing=false) — the client reads that as "done & fully
+ * indexed", clears its deadline and STOPS polling, so the admin never sees the
+ * real climb. Pinning the pre-seed TTL to the client cap means a deduped phantom
+ * is bounded to ~120s — the same window the client already polls — and a genuine
+ * pending run never expires-into-"done" inside that window.
  */
-const PRE_SEED_TTL_SECONDS = 45;
+const PRE_SEED_TTL_SECONDS = 120;
 
 /**
  * Shape of the partial update accepted by `update`. Mirrors the validated
@@ -128,10 +141,13 @@ export class AiSettingsService {
     let seeded = false;
     if ((await this.reindexProgress.get(workspaceId)) === null) {
       const totalPages = await this.pageRepo.countEmbeddablePages(workspaceId);
-      // Short TTL: if add() below de-duplicates against a just-finishing job
-      // whose worker already clear()ed but isn't removed yet, no worker runs to
-      // clear this seed — the short TTL expires the phantom record in seconds
-      // rather than leaving a stuck "reindexing: 0 of N" for the full record TTL.
+      // Short TTL (vs the full 1h record TTL): if add() below de-duplicates
+      // against a just-finishing job whose worker already clear()ed but isn't
+      // removed yet, no worker runs to clear this seed — the shorter TTL expires
+      // the phantom record rather than leaving a stuck "reindexing: 0 of N" for
+      // the full record TTL. It is kept >= the client poll cap (120s) so a
+      // genuine but still-pending run never expires into a false "done" while
+      // the client is still polling (see PRE_SEED_TTL_SECONDS).
       await this.reindexProgress.start(
         workspaceId,
         totalPages,
diff --git a/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts b/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts
index ae7ffde5..db16399c 100644
--- a/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts
+++ b/apps/server/test/integration/page-embeddable-ids-lockstep.int-spec.ts
@@ -42,10 +42,14 @@ describe('PageRepo embeddable-page set: getEmbeddablePageIds <-> countEmbeddable
     await destroyTestDb();
   });
 
-  // Insert a page with explicit text_content / deleted_at (createPage in db.ts
-  // sets neither), returning its id so the test can assert membership.
+  // Insert a page with explicit text_content / content / deleted_at (createPage
+  // in db.ts sets none), returning its id so the test can assert membership.
+  // `content` is the ProseMirror doc JSON (jsonb): postgres.js serializes a plain
+  // object to JSON for jsonb columns, so we pass it through only when supplied so
+  // the rest of the rows keep the DB default.
   async function insertPage(args: {
     textContent: string | null;
+    content?: unknown;
     deletedAt?: Date | null;
   }): Promise<string> {
     const id = randomUUID();
@@ -58,6 +62,7 @@ describe('PageRepo embeddable-page set: getEmbeddablePageIds <-> countEmbeddable
         spaceId,
         workspaceId,
         textContent: args.textContent,
+        ...(args.content !== undefined ? { content: args.content as any } : {}),
         deletedAt: args.deletedAt ?? null,
       })
       .execute();
@@ -96,29 +101,59 @@ describe('PageRepo embeddable-page set: getEmbeddablePageIds <-> countEmbeddable
     //     so the reindex can clear them).
     const noTextLiveEmbedding = await insertPage({ textContent: null });
     await insertEmbedding(noTextLiveEmbedding);
+    // (c) non-deleted page with EMPTY text_content but ProseMirror `content` JSON
+    //     carrying a real text node — the content-JSON clause. This pins BOTH the
+    //     third OR-clause AND the space-after-colon: jsonb stores the key/value
+    //     separator as `"type": "text"` (a space after the colon), which is why
+    //     the predicate needs `[[:space:]]*`. `reindexPage` extracts this text, so
+    //     the page IS embeddable and the reindex MUST visit it.
+    const noTextContentDoc = await insertPage({
+      textContent: null,
+      content: {
+        type: 'doc',
+        content: [
+          { type: 'paragraph', content: [{ type: 'text', text: 'hello' }] },
+        ],
+      },
+    });
 
     // OUT of the set ----------------------------------------------------------
-    // (c) non-deleted, text_content NULL, no embeddings.
+    // (d) non-deleted, text_content NULL, no embeddings.
     await insertPage({ textContent: null });
-    // (d) non-deleted, whitespace-only text (regex requires a non-space char).
+    // (e) non-deleted, whitespace-only text (regex requires a non-space char).
     await insertPage({ textContent: '   \n\t  ' });
-    // (e) deleted page WITH body text — excluded by the non-deleted predicate.
+    // (f) deleted page WITH body text — excluded by the non-deleted predicate.
     await insertPage({
       textContent: 'deleted but had text',
       deletedAt: new Date(),
     });
-    // (f) non-deleted, no text, with ONLY a DELETED embedding row — the EXISTS
+    // (g) non-deleted, no text, with ONLY a DELETED embedding row — the EXISTS
     //     subquery filters pe.deleted_at IS NULL, so this stays out.
     const onlyDeletedEmbedding = await insertPage({ textContent: null });
     await insertEmbedding(onlyDeletedEmbedding, { deletedAt: new Date() });
+    // (h) non-deleted, empty text_content, content JSON with ONLY a math atom
+    //     node — its LaTeX lives in `attrs.text` (a `"text":` KEY, not a
+    //     `"type":"text"` text node) and has no text serializer, so `jsonToText`
+    //     yields nothing and the page produces zero embeddings. The predicate
+    //     keys on the structural `"type":"text"` marker, so this stays OUT (a
+    //     bare `"text":` match would wrongly inflate the denominator).
+    await insertPage({
+      textContent: null,
+      content: {
+        type: 'doc',
+        content: [{ type: 'mathBlock', attrs: { text: 'E=mc^2' } }],
+      },
+    });
 
     const ids = await repo.getEmbeddablePageIds(workspaceId);
     const count = await repo.countEmbeddablePages(workspaceId);
 
     // The two queries agree on the size (the load-bearing lockstep invariant)...
     expect(ids.length).toBe(count);
-    // ...and the set is exactly the two qualifying pages, nothing else.
-    expect(new Set(ids)).toEqual(new Set([withText, noTextLiveEmbedding]));
-    expect(count).toBe(2);
+    // ...and the set is exactly the three qualifying pages, nothing else.
+    expect(new Set(ids)).toEqual(
+      new Set([withText, noTextLiveEmbedding, noTextContentDoc]),
+    );
+    expect(count).toBe(3);
   });
 });