fix(ai): include content-bearing pages in reindex coverage; correct progress race & hot path (F6-F10)

F6: extend embeddablePredicate to pages with body content but null/empty text_content, keyed on the text-node marker "type":"text" (not a bare "text": key, which also matched math nodes' attrs.text and would keep the "Indexed N of M" bar stuck below 100% for math-only pages). The reindex set (getEmbeddablePageIds) and the denominator (countEmbeddablePages) share the predicate; tests assert the compiled WHERE is byte-identical between the two and that a math-only doc is excluded. F7: correct the start() JSDoc (both totals are the real page count). F8: nextReindexPollInterval reuses isReindexComplete. F9: getMasked reads progress first and skips the two COUNTs while a reindex is active. F10: pre-seed the progress entry with a short 45s TTL so a deduped enqueue's phantom "0 of N" expires quickly instead of sticking for the 1h TTL. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 14:37:26 +03:00
parent bdc033e689
commit 91f24fc062
7 changed files with 298 additions and 33 deletions
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -198,16 +198,18 @@ export function nextReindexPollInterval(args: {
  if (now > deadline) return false;
  // Active run → keep polling even if the momentary counts already look full.
  if (status?.reindexing) return intervalMs;
-  // Finished and fully indexed (incl. an empty workspace, 0 >= 0) → stop.
-  if (status && status.indexedPages >= status.totalPages) return false;
+  // Finished and fully indexed (incl. an empty workspace, 0 >= 0) → stop. Reuse
+  // isReindexComplete so the completeness check lives in exactly one place.
+  if (isReindexComplete(status)) return false;
  // Within the deadline and not yet done → keep polling.
  return intervalMs;
 }

 /**
 * Whether the reindex poll deadline should be cleared: the server reports no
- * active run AND the count is complete. Mirrors the stop condition of
- * `nextReindexPollInterval` (sans the cap, which the effect handles via time).
+ * active run AND the count is complete. The single source of truth for the
+ * "reindex finished" check — `nextReindexPollInterval` reuses it for its stop
+ * condition (sans the cap, which the effect handles via time).
 */
 export function isReindexComplete(status?: ReindexStatus): boolean {
  return (
--- a/apps/server/src/database/repos/page/page.repo.embeddable.spec.ts
+++ b/apps/server/src/database/repos/page/page.repo.embeddable.spec.ts
@@ -0,0 +1,167 @@
+import { PageRepo } from './page.repo';
+import {
+  DummyDriver,
+  Kysely,
+  PostgresAdapter,
+  PostgresIntrospector,
+  PostgresQueryCompiler,
+} from 'kysely';
+
+/**
+ * F6 regression guard for the embeddable-page predicate.
+ *
+ * The predicate is shared by `countEmbeddablePages` (the "Indexed N of M" coverage
+ * denominator) and `getEmbeddablePageIds` (the exact set a full reindex iterates).
+ * It MUST select pages whose `text_content` was never backfilled (null/empty) but
+ * whose ProseMirror `content` JSON still carries body text — `reindexPage` builds
+ * its chunks straight from `content`, so without a content clause such a page is
+ * silently SKIPPED by a mass reindex even though it is fully embeddable.
+ *
+ * The content clause keys on the structural text-node marker `"type":"text"`, NOT
+ * a bare `"text":` key. The bare key also appears as the `attrs.text` of atom
+ * nodes that carry NO extractable text — notably math (`mathBlock`/`mathInline`),
+ * whose LaTeX lives in `attrs.text` and has no `generateText` serializer. A
+ * math-ONLY page therefore yields empty `text_content` and zero embeddings; if the
+ * predicate matched its `attrs.text` it would land in the denominator but
+ * `reindexPage` would no-op on it, pinning "Indexed N of M" below 100% forever —
+ * the exact bug this feature fixes. The `"type":"text"` marker matches only real
+ * text nodes (what `jsonToText` extracts), keeping the predicate consistent with
+ * what gets indexed.
+ *
+ * There is no real Postgres here: a recording Kysely (DummyDriver wired to the
+ * Postgres query compiler) compiles the queries to SQL so we can assert the WHERE
+ * predicate ORs in the narrowed content clause alongside the existing text_content
+ * and stored-embeddings clauses — and that BOTH callers compile the identical
+ * clause (denominator and reindex set can never diverge).
+ */
+function makeRecordingDb() {
+  const sqls: string[] = [];
+  const db = new Kysely<any>({
+    dialect: {
+      createAdapter: () => new PostgresAdapter(),
+      createDriver: () =>
+        new (class extends DummyDriver {
+          async acquireConnection() {
+            return {
+              executeQuery: async (compiled: { sql: string }) => {
+                sqls.push(compiled.sql);
+                return { rows: [] };
+              },
+              // eslint-disable-next-line @typescript-eslint/no-empty-function
+              streamQuery: async function* () {},
+            } as any;
+          }
+        })(),
+      createIntrospector: (d: Kysely<any>) => new PostgresIntrospector(d),
+      createQueryCompiler: () => new PostgresQueryCompiler(),
+    },
+  });
+  return { db, sqls };
+}
+
+// The narrowed content clause, as it appears in the compiled SQL. Keying on the
+// structural `"type":"text"` marker (not a bare `"text":` key) is what excludes
+// math-only pages whose only `"text"` key is the atom node's `attrs.text`.
+const NARROWED_CLAUSE = `"type"[[:space:]]*:[[:space:]]*"text"`;
+const BARE_TEXT_KEY = `"text"[[:space:]]*:`;
+
+describe('PageRepo embeddable predicate — content-bearing pages (F6)', () => {
+  it('selects content-bearing pages via the narrowed "type":"text" node marker', async () => {
+    const { db, sqls } = makeRecordingDb();
+    const repo = new PageRepo(db as any, {} as any, { emit: jest.fn() } as any);
+
+    await repo.getEmbeddablePageIds('ws-1');
+
+    expect(sqls).toHaveLength(1);
+    const sql = sqls[0];
+
+    // Clause 1 (existing): pages with extractable text_content.
+    expect(sql).toContain('text_content');
+    // Clause 3 (the F6 fix, now narrowed): a page whose content JSON carries a
+    // real text node is selected even when text_content is null/empty, so a full
+    // reindex visits it instead of silently skipping it.
+    expect(sql).toContain('content::text');
+    expect(sql).toContain(NARROWED_CLAUSE);
+    // It must NOT use the old bare `"text":` key, which also matches the
+    // `attrs.text` of math-only atom pages (false-positive denominator inflation).
+    expect(sql).not.toContain(BARE_TEXT_KEY);
+    // Clause 2 (existing): pages that already have stored embeddings stay in the
+    // set so a reindex can clear their stale rows.
+    expect(sql.toLowerCase()).toContain('embeddings');
+  });
+
+  it('countEmbeddablePages compiles the SAME narrowed clause as getEmbeddablePageIds', async () => {
+    // Consistency is the core requirement: the denominator (countEmbeddablePages)
+    // and the reindex set (getEmbeddablePageIds) MUST share the identical
+    // predicate, else the live "done" counter and the steady-state total diverge.
+    const { db, sqls } = makeRecordingDb();
+    const repo = new PageRepo(db as any, {} as any, { emit: jest.fn() } as any);
+
+    await repo.countEmbeddablePages('ws-1');
+    await repo.getEmbeddablePageIds('ws-1');
+
+    expect(sqls).toHaveLength(2);
+    const [countSql, idsSql] = sqls;
+
+    // Both carry the narrowed content clause...
+    expect(countSql).toContain(NARROWED_CLAUSE);
+    expect(idsSql).toContain(NARROWED_CLAUSE);
+    // ...neither carries the bare key...
+    expect(countSql).not.toContain(BARE_TEXT_KEY);
+    expect(idsSql).not.toContain(BARE_TEXT_KEY);
+    // ...and the full OR predicate (text_content + content node + embeddings
+    // EXISTS) is byte-identical between the two queries, so they can't drift.
+    const where = (s: string) => s.slice(s.indexOf('where'));
+    expect(where(countSql)).toEqual(where(idsSql));
+  });
+
+  it('the content regex matches a text-bearing doc but NOT a math-only doc', () => {
+    // Semantic check of the predicate against sample `content::text` payloads.
+    // Note: `jsonb::text` is NOT identical to JSON.stringify — Postgres renders a
+    // space after each colon (`"type": "text"`), which is exactly why the POSIX
+    // clause uses `[[:space:]]*`. The clause `"type"[[:space:]]*:[[:space:]]*"text"`
+    // maps to the JS regex below (`[[:space:]]` -> `\s`, tolerating both forms);
+    // we evaluate it the way Postgres would.
+    const re = /"type"\s*:\s*"text"/;
+
+    // A real paragraph with a text node -> embeddable.
+    const textDoc = JSON.stringify({
+      type: 'doc',
+      content: [
+        {
+          type: 'paragraph',
+          content: [{ type: 'text', text: 'hello world' }],
+        },
+      ],
+    });
+    // A doc whose ONLY node is a math atom. Its LaTeX is in `attrs.text`, there is
+    // no text node, and `jsonToText`/`generateText` has no serializer for it -> it
+    // yields empty text_content and zero embeddings, so it must NOT qualify.
+    const mathOnlyDoc = JSON.stringify({
+      type: 'doc',
+      content: [
+        { type: 'mathBlock', attrs: { text: 'E = mc^2' } },
+        { type: 'mathInline', attrs: { text: '\\alpha' } },
+      ],
+    });
+    // An empty doc has no text node either.
+    const emptyDoc = JSON.stringify({ type: 'doc', content: [] });
+
+    expect(re.test(textDoc)).toBe(true);
+    expect(re.test(mathOnlyDoc)).toBe(false);
+    expect(re.test(emptyDoc)).toBe(false);
+    // Sanity: the OLD bare-key regex WOULD have wrongly matched the math-only doc,
+    // which is precisely the false positive the narrowing removes.
+    expect(/"text"\s*:/.test(mathOnlyDoc)).toBe(true);
+
+    // A user literally TYPING `"type":"text"` in prose can't false-positive on an
+    // otherwise text-less page: in `content::text` the typed value's quotes are
+    // escaped (`\"type\":\"text\"`), so the literal-quote regex does not match the
+    // escaped form. (And such a page has a genuine text node anyway.)
+    const escapedLiteral = JSON.stringify({
+      type: 'doc',
+      content: [{ type: 'someAtom', attrs: { note: '"type":"text"' } }],
+    });
+    expect(re.test(escapedLiteral)).toBe(false);
+  });
+});
--- a/apps/server/src/database/repos/page/page.repo.ts
+++ b/apps/server/src/database/repos/page/page.repo.ts
@@ -234,9 +234,9 @@ export class PageRepo {
   * text-less pages (which legitimately store zero embeddings) don't keep the
   * bar below 100% forever.
   *
-   * A page qualifies if it has non-empty textContent OR already has stored
-   * embeddings. The second clause covers pages whose text the indexer extracted
-   * from the content JSON when textContent was null, and guarantees this total is
+   * A page qualifies if it has non-empty textContent, OR its content JSON has at
+   * least one text node (`"type":"text"`) even when textContent was never backfilled,
+   * OR it already has stored embeddings. The last clause guarantees this total is
   * always >= countIndexedPages (the indexed count can never exceed it).
   */
  async countEmbeddablePages(workspaceId: string): Promise<number> {
@@ -259,8 +259,10 @@ export class PageRepo {
   * the trivial workspaceId/deletedAt filters inline; this returns only the
   * non-trivial OR clause, evaluated against the `p` alias of `pages`.
   *
-   * A page qualifies if it has non-empty textContent OR already has a stored
-   * (non-deleted) embedding row.
+   * A page qualifies if it has non-empty textContent, OR its ProseMirror
+   * `content` JSON has at least one text node (`"type":"text"`) even though
+   * textContent was never backfilled, OR it already has a stored (non-deleted)
+   * embedding row.
   */
  private embeddablePredicate(
    eb: ExpressionBuilder<DbInterface & { p: DbInterface['pages'] }, 'p'>,
@@ -270,6 +272,25 @@ export class PageRepo {
      // character, mirroring the indexer's `text.trim().length === 0` check
      // (raw SQL -> use the snake_case column name).
      sql<boolean>`p.text_content ~ '[^[:space:]]'`,
+      // OR the ProseMirror `content` JSON has at least one text node (`"type":
+      // "text"`) the indexer can extract, even when `text_content` is null/empty
+      // (never backfilled): `reindexPage` runs `jsonToText` (generateText) over
+      // `content`, which only emits the text of ProseMirror text nodes, so such a
+      // page IS embeddable and a full reindex MUST visit it (otherwise it is
+      // silently skipped). A text node always serialises as
+      // `{"type":"text","text":"..."}`, so we key on the structural `"type":
+      // "text"` marker — NOT a bare `"text":` key, which also appears as the
+      // `attrs.text` of atom nodes that carry NO extractable text (e.g. math
+      // `mathBlock`/`mathInline`, whose LaTeX lives in `attrs.text` and has no
+      // text serializer). A math-only page thus produces empty `text_content` and
+      // zero embeddings; matching its `attrs.text` here would wrongly inflate the
+      // denominator and keep "Indexed N of M" below 100% forever. An empty doc
+      // (no text nodes) has no `"type":"text"` and is correctly excluded. A user
+      // who literally types `"type":"text"` in their prose can't false-positive:
+      // in `content::text` that text value's quotes are escaped (`\"type\"...`),
+      // so the literal-quote regex won't match the escaped form (and such a page
+      // has a real text node anyway).
+      sql<boolean>`p.content::text ~ '"type"[[:space:]]*:[[:space:]]*"text"'`,
      // OR already has at least one (non-deleted) embedding row.
      eb.exists(
        eb
@@ -284,7 +305,9 @@ export class PageRepo {
  /**
   * IDs of the EMBEDDABLE page set for a workspace — the exact same set that
   * `countEmbeddablePages` counts (a page qualifies if it has non-empty
-   * textContent OR already has a stored embedding row). The bulk reindex
+   * textContent, OR content JSON with at least one text node (`"type":"text"`)
+   * even when textContent is empty/null, OR already has a stored embedding row). The
+   * bulk reindex
   * iterates THIS set so the live "done" counter reaches exactly
   * `countEmbeddablePages` (the steady-state denominator), instead of iterating
   * every non-deleted page (which would push the denominator above the
--- a/apps/server/src/integrations/ai/ai-settings.service.spec.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.spec.ts
@@ -172,7 +172,17 @@ describe('AiSettingsService.reindex progress seed', () => {

    await service.reindex(WORKSPACE_ID);

-    expect(reindexProgress.start).toHaveBeenCalledWith(WORKSPACE_ID, 478);
+    // The pre-seed carries the real page count AND a SHORT ttl (3rd arg) so a
+    // de-duplicated enqueue against a just-finishing job can't leave a phantom
+    // "reindexing: 0 of N" stuck for the full record TTL (F10).
+    expect(reindexProgress.start).toHaveBeenCalledWith(
+      WORKSPACE_ID,
+      478,
+      expect.any(Number),
+    );
+    const ttl = reindexProgress.start.mock.calls[0][2];
+    expect(ttl).toBeGreaterThan(0);
+    expect(ttl).toBeLessThanOrEqual(60); // short, not the full 1h record TTL
    expect(aiQueue.add).toHaveBeenCalledTimes(1);
    // Seed must precede the enqueue so the first poll already reports done=0.
    expect(order).toEqual(['start', 'add']);
@@ -204,7 +214,11 @@ describe('AiSettingsService.reindex progress seed', () => {

    await expect(service.reindex(WORKSPACE_ID)).rejects.toBe(boom);

-    expect(reindexProgress.start).toHaveBeenCalledWith(WORKSPACE_ID, 478);
+    expect(reindexProgress.start).toHaveBeenCalledWith(
+      WORKSPACE_ID,
+      478,
+      expect.any(Number),
+    );
    expect(reindexProgress.clear).toHaveBeenCalledWith(WORKSPACE_ID);
  });

--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -31,6 +31,17 @@ export function parsePositiveInt(raw: unknown): number | undefined {
  return Number.isFinite(n) && n > 0 ? Math.floor(n) : undefined;
 }

+/**
+ * TTL (seconds) for the enqueue-time progress PRE-SEED written by `reindex()`
+ * before the worker starts. Deliberately SHORT: if `aiQueue.add()` de-duplicates
+ * against a job that is just finishing (the worker's finally already ran
+ * `clear()` but removeOnComplete hasn't yet removed the job), no new worker runs
+ * to overwrite/clear this seed — so a short TTL lets the phantom "reindexing:
+ * 0 of N" expire in seconds instead of sticking for the full 1h record TTL. A
+ * worker that DOES start re-seeds with the full TTL, so a real run is unaffected.
+ */
+const PRE_SEED_TTL_SECONDS = 45;
+
 /**
 * Shape of the partial update accepted by `update`. Mirrors the validated
 * controller DTO. `apiKey` / `embeddingApiKey` are write-only: undefined =
@@ -117,7 +128,15 @@ export class AiSettingsService {
    let seeded = false;
    if ((await this.reindexProgress.get(workspaceId)) === null) {
      const totalPages = await this.pageRepo.countEmbeddablePages(workspaceId);
-      await this.reindexProgress.start(workspaceId, totalPages);
+      // Short TTL: if add() below de-duplicates against a just-finishing job
+      // whose worker already clear()ed but isn't removed yet, no worker runs to
+      // clear this seed — the short TTL expires the phantom record in seconds
+      // rather than leaving a stuck "reindexing: 0 of N" for the full record TTL.
+      await this.reindexProgress.start(
+        workspaceId,
+        totalPages,
+        PRE_SEED_TTL_SECONDS,
+      );
      seeded = true;
    }

@@ -286,22 +305,33 @@ export class AiSettingsService {
      hasSttApiKey = !!creds?.sttApiKeyEnc;
    }

-    // totalPages now counts only pages with embeddable content (non-empty text
-    // or already-stored embeddings), so empty/text-less pages don't keep the
-    // "Indexed N of M pages" bar below 100% forever.
-    const [indexedPages, totalPages] = await Promise.all([
-      this.pageEmbeddingRepo.countIndexedPages(workspaceId),
-      this.pageRepo.countEmbeddablePages(workspaceId),
-    ]);
-
    // While a reindex run is active, report its LIVE progress (done climbs 0 ->
-    // total) so the settings UI can watch it advance. Without this the counter
-    // never drops: the per-page reindex hard-replaces rows in its own small
-    // transaction, so countIndexedPages stays ~= total for the whole run. With
-    // no active record we fall back to the steady-state DB coverage count, which
+    // total) so the settings UI can watch it advance. Read progress FIRST and
+    // short-circuit: this endpoint is polled every ~5s for the whole run, so when
+    // a record is active we skip the two coverage COUNTs entirely (their results
+    // would be discarded anyway). Without the live progress the counter never
+    // drops: the per-page reindex hard-replaces rows in its own small
+    // transaction, so countIndexedPages stays ~= total for the whole run. With no
+    // active record we fall back to the steady-state DB coverage count, which
    // preserves the existing display and the client's "done == total -> stop
    // polling" condition (the run ends -> record cleared -> DB count == total).
+    //
+    // The fallback `totalPages` counts only pages with embeddable content
+    // (non-empty text, content-borne text, or already-stored embeddings), so
+    // empty/text-less pages don't keep the "Indexed N of M pages" bar below 100%
+    // forever.
    const progress = await this.reindexProgress.get(workspaceId);
+    let indexedPages: number;
+    let totalPages: number;
+    if (progress) {
+      indexedPages = progress.done;
+      totalPages = progress.total;
+    } else {
+      [indexedPages, totalPages] = await Promise.all([
+        this.pageEmbeddingRepo.countIndexedPages(workspaceId),
+        this.pageRepo.countEmbeddablePages(workspaceId),
+      ]);
+    }

    return {
      driver: provider.driver,
@@ -321,8 +351,8 @@ export class AiSettingsService {
      hasApiKey,
      hasEmbeddingApiKey,
      hasSttApiKey,
-      indexedPages: progress ? progress.done : indexedPages,
-      totalPages: progress ? progress.total : totalPages,
+      indexedPages,
+      totalPages,
      // Optional hint for the client: a reindex run is currently in progress.
      reindexing: progress != null,
    };
--- a/apps/server/src/integrations/ai/embedding-reindex-progress.service.spec.ts
+++ b/apps/server/src/integrations/ai/embedding-reindex-progress.service.spec.ts
@@ -115,6 +115,22 @@ describe('EmbeddingReindexProgressService', () => {
      expect(multiObj.exec).toHaveBeenCalledTimes(1);
    });

+    it('defaults the expire TTL to the full 1h record TTL', async () => {
+      const { redis, multiObj } = makeRedis();
+      await makeService(redis).start(WORKSPACE_ID, 478);
+      // Default ttl = full record TTL (60 * 60) so a real run never expires
+      // mid-flight before the worker refreshes it on each increment.
+      expect(multiObj.expire).toHaveBeenCalledWith(KEY, 60 * 60);
+    });
+
+    it('honours an explicit short ttlSeconds for the enqueue-time pre-seed (F10)', async () => {
+      const { redis, multiObj } = makeRedis();
+      // The reindex() pre-seed passes a short ttl so a phantom record left by a
+      // de-duplicated enqueue expires in seconds, not after the full 1h TTL.
+      await makeService(redis).start(WORKSPACE_ID, 478, 45);
+      expect(multiObj.expire).toHaveBeenCalledWith(KEY, 45);
+    });
+
    it('swallows a thrown Redis error (best-effort)', async () => {
      const { redis } = makeRedis({
        execImpl: () => Promise.reject(new Error('redis down')),
--- a/apps/server/src/integrations/ai/embedding-reindex-progress.service.ts
+++ b/apps/server/src/integrations/ai/embedding-reindex-progress.service.ts
@@ -65,12 +65,25 @@ export class EmbeddingReindexProgressService {

  /**
   * Begin (or reset) the progress record for a workspace: `total` pages, `done`
-   * back to 0, `startedAt` now. Called at reindex enqueue time (placeholder
-   * total, so the very first status poll already reports done=0) and again at
-   * the worker start (overwriting `total` with the real page count). Resets
-   * `done` to 0 so a re-trigger never inherits a stale count.
+   * back to 0, `startedAt` now. Called twice for a run, BOTH with the real page
+   * count (countEmbeddablePages) so the two totals coincide: once at reindex
+   * enqueue time (so the very first status poll already reports done=0) and again
+   * at the worker start (which re-asserts the same total and resets `done`).
+   * Resets `done` to 0 so a re-trigger never inherits a stale count.
+   *
+   * `ttlSeconds` lets the caller pick the record's lifetime. The enqueue-time
+   * pre-seed passes a SHORT ttl: if `aiQueue.add()` de-duplicates against a job
+   * that is just finishing (its worker hasn't yet removed the job but already
+   * ran its `clear()`), no new worker starts to clear this phantom seed, so a
+   * short ttl lets it expire in seconds instead of sticking for the full TTL.
+   * The worker's own `start()` at the start of a real run overwrites this entry
+   * and raises the ttl back to the default full TTL.
   */
-  async start(workspaceId: string, total: number): Promise<void> {
+  async start(
+    workspaceId: string,
+    total: number,
+    ttlSeconds: number = TTL_SECONDS,
+  ): Promise<void> {
    const key = this.key(workspaceId);
    try {
      await this.redis
@@ -80,7 +93,7 @@ export class EmbeddingReindexProgressService {
          done: '0',
          startedAt: String(Date.now()),
        })
-        .expire(key, TTL_SECONDS)
+        .expire(key, ttlSeconds)
        .exec();
    } catch (err) {
      this.logger.warn(