fix(ai): include content-bearing pages in reindex coverage; correct progress race & hot path (F6-F10)

F6: extend embeddablePredicate to pages with body content but null text_content,
    keyed on the text-node marker "type":"text" (not a bare "text": key, which
    also matched math nodes' attrs.text and would leave math-only pages stuck
    below 100%). Numerator and denominator share the predicate; tests assert the
    compiled WHERE is byte-identical and a math-only doc is excluded.
F7: correct the start() JSDoc (both totals are the real page count).
F8: nextReindexPollInterval reuses isReindexComplete.
F9: getMasked reads progress first and skips the two COUNTs while a reindex is active.
F10: pre-seed the progress entry with a short 45s TTL so a deduped enqueue's
     phantom "0 of N" expires quickly instead of sticking for the 1h TTL.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-06-29 14:37:26 +03:00
parent bdc033e689
commit 91f24fc062
7 changed files with 298 additions and 33 deletions

View File

@@ -198,16 +198,18 @@ export function nextReindexPollInterval(args: {
if (now > deadline) return false;
// Active run → keep polling even if the momentary counts already look full.
if (status?.reindexing) return intervalMs;
// Finished and fully indexed (incl. an empty workspace, 0 >= 0) → stop.
if (status && status.indexedPages >= status.totalPages) return false;
// Finished and fully indexed (incl. an empty workspace, 0 >= 0) → stop. Reuse
// isReindexComplete so the completeness check lives in exactly one place.
if (isReindexComplete(status)) return false;
// Within the deadline and not yet done → keep polling.
return intervalMs;
}
/**
* Whether the reindex poll deadline should be cleared: the server reports no
* active run AND the count is complete. Mirrors the stop condition of
* `nextReindexPollInterval` (sans the cap, which the effect handles via time).
* active run AND the count is complete. The single source of truth for the
* "reindex finished" check — `nextReindexPollInterval` reuses it for its stop
* condition (sans the cap, which the effect handles via time).
*/
export function isReindexComplete(status?: ReindexStatus): boolean {
return (

View File

@@ -0,0 +1,167 @@
import { PageRepo } from './page.repo';
import {
DummyDriver,
Kysely,
PostgresAdapter,
PostgresIntrospector,
PostgresQueryCompiler,
} from 'kysely';
/**
* F6 regression guard for the embeddable-page predicate.
*
* The predicate is shared by `countEmbeddablePages` (the "Indexed N of M" coverage
* denominator) and `getEmbeddablePageIds` (the exact set a full reindex iterates).
* It MUST select pages whose `text_content` was never backfilled (null/empty) but
* whose ProseMirror `content` JSON still carries body text — `reindexPage` builds
* its chunks straight from `content`, so without a content clause such a page is
* silently SKIPPED by a mass reindex even though it is fully embeddable.
*
* The content clause keys on the structural text-node marker `"type":"text"`, NOT
* a bare `"text":` key. The bare key also appears as the `attrs.text` of atom
* nodes that carry NO extractable text — notably math (`mathBlock`/`mathInline`),
* whose LaTeX lives in `attrs.text` and has no `generateText` serializer. A
* math-ONLY page therefore yields empty `text_content` and zero embeddings; if the
* predicate matched its `attrs.text` it would land in the denominator but
* `reindexPage` would no-op on it, pinning "Indexed N of M" below 100% forever —
* the exact bug this feature fixes. The `"type":"text"` marker matches only real
* text nodes (what `jsonToText` extracts), keeping the predicate consistent with
* what gets indexed.
*
* There is no real Postgres here: a recording Kysely (DummyDriver wired to the
* Postgres query compiler) compiles the queries to SQL so we can assert the WHERE
* predicate ORs in the narrowed content clause alongside the existing text_content
* and stored-embeddings clauses — and that BOTH callers compile the identical
* clause (denominator and reindex set can never diverge).
*/
function makeRecordingDb() {
const sqls: string[] = [];
const db = new Kysely<any>({
dialect: {
createAdapter: () => new PostgresAdapter(),
createDriver: () =>
new (class extends DummyDriver {
async acquireConnection() {
return {
executeQuery: async (compiled: { sql: string }) => {
sqls.push(compiled.sql);
return { rows: [] };
},
// eslint-disable-next-line @typescript-eslint/no-empty-function
streamQuery: async function* () {},
} as any;
}
})(),
createIntrospector: (d: Kysely<any>) => new PostgresIntrospector(d),
createQueryCompiler: () => new PostgresQueryCompiler(),
},
});
return { db, sqls };
}
// The narrowed content clause, as it appears in the compiled SQL. Keying on the
// structural `"type":"text"` marker (not a bare `"text":` key) is what excludes
// math-only pages whose only `"text"` key is the atom node's `attrs.text`.
const NARROWED_CLAUSE = `"type"[[:space:]]*:[[:space:]]*"text"`;
const BARE_TEXT_KEY = `"text"[[:space:]]*:`;
describe('PageRepo embeddable predicate — content-bearing pages (F6)', () => {
it('selects content-bearing pages via the narrowed "type":"text" node marker', async () => {
const { db, sqls } = makeRecordingDb();
const repo = new PageRepo(db as any, {} as any, { emit: jest.fn() } as any);
await repo.getEmbeddablePageIds('ws-1');
expect(sqls).toHaveLength(1);
const sql = sqls[0];
// Clause 1 (existing): pages with extractable text_content.
expect(sql).toContain('text_content');
// Clause 3 (the F6 fix, now narrowed): a page whose content JSON carries a
// real text node is selected even when text_content is null/empty, so a full
// reindex visits it instead of silently skipping it.
expect(sql).toContain('content::text');
expect(sql).toContain(NARROWED_CLAUSE);
// It must NOT use the old bare `"text":` key, which also matches the
// `attrs.text` of math-only atom pages (false-positive denominator inflation).
expect(sql).not.toContain(BARE_TEXT_KEY);
// Clause 2 (existing): pages that already have stored embeddings stay in the
// set so a reindex can clear their stale rows.
expect(sql.toLowerCase()).toContain('embeddings');
});
it('countEmbeddablePages compiles the SAME narrowed clause as getEmbeddablePageIds', async () => {
// Consistency is the core requirement: the denominator (countEmbeddablePages)
// and the reindex set (getEmbeddablePageIds) MUST share the identical
// predicate, else the live "done" counter and the steady-state total diverge.
const { db, sqls } = makeRecordingDb();
const repo = new PageRepo(db as any, {} as any, { emit: jest.fn() } as any);
await repo.countEmbeddablePages('ws-1');
await repo.getEmbeddablePageIds('ws-1');
expect(sqls).toHaveLength(2);
const [countSql, idsSql] = sqls;
// Both carry the narrowed content clause...
expect(countSql).toContain(NARROWED_CLAUSE);
expect(idsSql).toContain(NARROWED_CLAUSE);
// ...neither carries the bare key...
expect(countSql).not.toContain(BARE_TEXT_KEY);
expect(idsSql).not.toContain(BARE_TEXT_KEY);
// ...and the full OR predicate (text_content + content node + embeddings
// EXISTS) is byte-identical between the two queries, so they can't drift.
const where = (s: string) => s.slice(s.indexOf('where'));
expect(where(countSql)).toEqual(where(idsSql));
});
it('the content regex matches a text-bearing doc but NOT a math-only doc', () => {
// Semantic check of the predicate against sample `content::text` payloads.
// Note: `jsonb::text` is NOT identical to JSON.stringify — Postgres renders a
// space after each colon (`"type": "text"`), which is exactly why the POSIX
// clause uses `[[:space:]]*`. The clause `"type"[[:space:]]*:[[:space:]]*"text"`
// maps to the JS regex below (`[[:space:]]` -> `\s`, tolerating both forms);
// we evaluate it the way Postgres would.
const re = /"type"\s*:\s*"text"/;
// A real paragraph with a text node -> embeddable.
const textDoc = JSON.stringify({
type: 'doc',
content: [
{
type: 'paragraph',
content: [{ type: 'text', text: 'hello world' }],
},
],
});
// A doc whose ONLY node is a math atom. Its LaTeX is in `attrs.text`, there is
// no text node, and `jsonToText`/`generateText` has no serializer for it -> it
// yields empty text_content and zero embeddings, so it must NOT qualify.
const mathOnlyDoc = JSON.stringify({
type: 'doc',
content: [
{ type: 'mathBlock', attrs: { text: 'E = mc^2' } },
{ type: 'mathInline', attrs: { text: '\\alpha' } },
],
});
// An empty doc has no text node either.
const emptyDoc = JSON.stringify({ type: 'doc', content: [] });
expect(re.test(textDoc)).toBe(true);
expect(re.test(mathOnlyDoc)).toBe(false);
expect(re.test(emptyDoc)).toBe(false);
// Sanity: the OLD bare-key regex WOULD have wrongly matched the math-only doc,
// which is precisely the false positive the narrowing removes.
expect(/"text"\s*:/.test(mathOnlyDoc)).toBe(true);
// A user literally TYPING `"type":"text"` in prose can't false-positive on an
// otherwise text-less page: in `content::text` the typed value's quotes are
// escaped (`\"type\":\"text\"`), so the literal-quote regex does not match the
// escaped form. (And such a page is a genuine text node anyway.)
const escapedLiteral = JSON.stringify({
type: 'doc',
content: [{ type: 'someAtom', attrs: { note: '"type":"text"' } }],
});
expect(re.test(escapedLiteral)).toBe(false);
});
});

View File

@@ -234,9 +234,9 @@ export class PageRepo {
* text-less pages (which legitimately store zero embeddings) don't keep the
* bar below 100% forever.
*
* A page qualifies if it has non-empty textContent OR already has stored
* embeddings. The second clause covers pages whose text the indexer extracted
* from the content JSON when textContent was null, and guarantees this total is
* A page qualifies if it has non-empty textContent, OR its content JSON has at
* least one text node (`"type":"text"`) when textContent was never backfilled,
* OR it already has stored embeddings. The last clause guarantees this total is
* always >= countIndexedPages (the indexed count can never exceed it).
*/
async countEmbeddablePages(workspaceId: string): Promise<number> {
@@ -259,8 +259,10 @@ export class PageRepo {
* the trivial workspaceId/deletedAt filters inline; this returns only the
* non-trivial OR clause, evaluated against the `p` alias of `pages`.
*
* A page qualifies if it has non-empty textContent OR already has a stored
* (non-deleted) embedding row.
* A page qualifies if it has non-empty textContent, OR its ProseMirror
* `content` JSON has at least one text node (`"type":"text"`) even though
* textContent was never backfilled, OR it already has a stored (non-deleted)
* embedding row.
*/
private embeddablePredicate(
eb: ExpressionBuilder<DbInterface & { p: DbInterface['pages'] }, 'p'>,
@@ -270,6 +272,25 @@ export class PageRepo {
// character, mirroring the indexer's `text.trim().length === 0` check
// (raw SQL -> use the snake_case column name).
sql<boolean>`p.text_content ~ '[^[:space:]]'`,
// OR the ProseMirror `content` JSON has at least one text node (`"type":
// "text"`) the indexer can extract, even when `text_content` is null/empty
// (never backfilled): `reindexPage` runs `jsonToText` (generateText) over
// `content`, which only emits the text of ProseMirror text nodes, so such a
// page IS embeddable and a full reindex MUST visit it (otherwise it is
// silently skipped). A text node always serialises as
// `{"type":"text","text":"..."}`, so we key on the structural `"type":
// "text"` marker — NOT a bare `"text":` key, which also appears as the
// `attrs.text` of atom nodes that carry NO extractable text (e.g. math
// `mathBlock`/`mathInline`, whose LaTeX lives in `attrs.text` and has no
// text serializer). A math-only page thus produces empty `text_content` and
// zero embeddings; matching its `attrs.text` here would wrongly inflate the
// denominator and keep "Indexed N of M" below 100% forever. An empty doc
// (no text nodes) has no `"type":"text"` and is correctly excluded. A user
// who literally types `"type":"text"` in their prose can't false-positive:
// in `content::text` that text value's quotes are escaped (`\"type\"...`),
// so the literal-quote regex won't match the escaped form (and such a page
// is a real text node anyway).
sql<boolean>`p.content::text ~ '"type"[[:space:]]*:[[:space:]]*"text"'`,
// OR already has at least one (non-deleted) embedding row.
eb.exists(
eb
@@ -284,7 +305,9 @@ export class PageRepo {
/**
* IDs of the EMBEDDABLE page set for a workspace — the exact same set that
* `countEmbeddablePages` counts (a page qualifies if it has non-empty
* textContent OR already has a stored embedding row). The bulk reindex
* textContent, OR content JSON with at least one text node (`"type":"text"`)
* and an empty/null textContent, OR already has a stored embedding row). The
* bulk reindex
* iterates THIS set so the live "done" counter reaches exactly
* `countEmbeddablePages` (the steady-state denominator), instead of iterating
* every non-deleted page (which would push the denominator above the

View File

@@ -172,7 +172,17 @@ describe('AiSettingsService.reindex progress seed', () => {
await service.reindex(WORKSPACE_ID);
expect(reindexProgress.start).toHaveBeenCalledWith(WORKSPACE_ID, 478);
// The pre-seed carries the real page count AND a SHORT ttl (3rd arg) so a
// de-duplicated enqueue against a just-finishing job can't leave a phantom
// "reindexing: 0 of N" stuck for the full record TTL (F10).
expect(reindexProgress.start).toHaveBeenCalledWith(
WORKSPACE_ID,
478,
expect.any(Number),
);
const ttl = reindexProgress.start.mock.calls[0][2];
expect(ttl).toBeGreaterThan(0);
expect(ttl).toBeLessThanOrEqual(60); // short, not the full 1h record TTL
expect(aiQueue.add).toHaveBeenCalledTimes(1);
// Seed must precede the enqueue so the first poll already reports done=0.
expect(order).toEqual(['start', 'add']);
@@ -204,7 +214,11 @@ describe('AiSettingsService.reindex progress seed', () => {
await expect(service.reindex(WORKSPACE_ID)).rejects.toBe(boom);
expect(reindexProgress.start).toHaveBeenCalledWith(WORKSPACE_ID, 478);
expect(reindexProgress.start).toHaveBeenCalledWith(
WORKSPACE_ID,
478,
expect.any(Number),
);
expect(reindexProgress.clear).toHaveBeenCalledWith(WORKSPACE_ID);
});

View File

@@ -31,6 +31,17 @@ export function parsePositiveInt(raw: unknown): number | undefined {
return Number.isFinite(n) && n > 0 ? Math.floor(n) : undefined;
}
/**
* TTL (seconds) for the enqueue-time progress PRE-SEED written by `reindex()`
* before the worker starts. Deliberately SHORT: if `aiQueue.add()` de-duplicates
* against a job that is just finishing (the worker's finally already ran
* `clear()` but removeOnComplete hasn't yet removed the job), no new worker runs
* to overwrite/clear this seed — so a short TTL lets the phantom "reindexing:
* 0 of N" expire in seconds instead of sticking for the full 1h record TTL. A
* worker that DOES start re-seeds with the full TTL, so a real run is unaffected.
*/
const PRE_SEED_TTL_SECONDS = 45;
/**
* Shape of the partial update accepted by `update`. Mirrors the validated
* controller DTO. `apiKey` / `embeddingApiKey` are write-only: undefined =
@@ -117,7 +128,15 @@ export class AiSettingsService {
let seeded = false;
if ((await this.reindexProgress.get(workspaceId)) === null) {
const totalPages = await this.pageRepo.countEmbeddablePages(workspaceId);
await this.reindexProgress.start(workspaceId, totalPages);
// Short TTL: if add() below de-duplicates against a just-finishing job
// whose worker already clear()ed but isn't removed yet, no worker runs to
// clear this seed — the short TTL expires the phantom record in seconds
// rather than leaving a stuck "reindexing: 0 of N" for the full record TTL.
await this.reindexProgress.start(
workspaceId,
totalPages,
PRE_SEED_TTL_SECONDS,
);
seeded = true;
}
@@ -286,22 +305,33 @@ export class AiSettingsService {
hasSttApiKey = !!creds?.sttApiKeyEnc;
}
// totalPages now counts only pages with embeddable content (non-empty text
// or already-stored embeddings), so empty/text-less pages don't keep the
// "Indexed N of M pages" bar below 100% forever.
const [indexedPages, totalPages] = await Promise.all([
this.pageEmbeddingRepo.countIndexedPages(workspaceId),
this.pageRepo.countEmbeddablePages(workspaceId),
]);
// While a reindex run is active, report its LIVE progress (done climbs 0 ->
// total) so the settings UI can watch it advance. Without this the counter
// never drops: the per-page reindex hard-replaces rows in its own small
// transaction, so countIndexedPages stays ~= total for the whole run. With
// no active record we fall back to the steady-state DB coverage count, which
// total) so the settings UI can watch it advance. Read progress FIRST and
// short-circuit: this endpoint is polled every ~5s for the whole run, so when
// a record is active we skip the two coverage COUNTs entirely (their results
// would be discarded anyway). Without the live progress the counter never
// drops: the per-page reindex hard-replaces rows in its own small
// transaction, so countIndexedPages stays ~= total for the whole run. With no
// active record we fall back to the steady-state DB coverage count, which
// preserves the existing display and the client's "done == total -> stop
// polling" condition (the run ends -> record cleared -> DB count == total).
//
// The fallback `totalPages` counts only pages with embeddable content
// (non-empty text, content-borne text, or already-stored embeddings), so
// empty/text-less pages don't keep the "Indexed N of M pages" bar below 100%
// forever.
const progress = await this.reindexProgress.get(workspaceId);
let indexedPages: number;
let totalPages: number;
if (progress) {
indexedPages = progress.done;
totalPages = progress.total;
} else {
[indexedPages, totalPages] = await Promise.all([
this.pageEmbeddingRepo.countIndexedPages(workspaceId),
this.pageRepo.countEmbeddablePages(workspaceId),
]);
}
return {
driver: provider.driver,
@@ -321,8 +351,8 @@ export class AiSettingsService {
hasApiKey,
hasEmbeddingApiKey,
hasSttApiKey,
indexedPages: progress ? progress.done : indexedPages,
totalPages: progress ? progress.total : totalPages,
indexedPages,
totalPages,
// Optional hint for the client: a reindex run is currently in progress.
reindexing: progress != null,
};

View File

@@ -115,6 +115,22 @@ describe('EmbeddingReindexProgressService', () => {
expect(multiObj.exec).toHaveBeenCalledTimes(1);
});
it('defaults the expire TTL to the full 1h record TTL', async () => {
const { redis, multiObj } = makeRedis();
await makeService(redis).start(WORKSPACE_ID, 478);
// Default ttl = full record TTL (60 * 60) so a real run never expires
// mid-flight before the worker refreshes it on each increment.
expect(multiObj.expire).toHaveBeenCalledWith(KEY, 60 * 60);
});
it('honours an explicit short ttlSeconds for the enqueue-time pre-seed (F10)', async () => {
const { redis, multiObj } = makeRedis();
// The reindex() pre-seed passes a short ttl so a phantom record left by a
// de-duplicated enqueue expires in seconds, not after the full 1h TTL.
await makeService(redis).start(WORKSPACE_ID, 478, 45);
expect(multiObj.expire).toHaveBeenCalledWith(KEY, 45);
});
it('swallows a thrown Redis error (best-effort)', async () => {
const { redis } = makeRedis({
execImpl: () => Promise.reject(new Error('redis down')),

View File

@@ -65,12 +65,25 @@ export class EmbeddingReindexProgressService {
/**
* Begin (or reset) the progress record for a workspace: `total` pages, `done`
* back to 0, `startedAt` now. Called at reindex enqueue time (placeholder
* total, so the very first status poll already reports done=0) and again at
* the worker start (overwriting `total` with the real page count). Resets
* `done` to 0 so a re-trigger never inherits a stale count.
* back to 0, `startedAt` now. Called twice for a run, BOTH with the real page
* count (countEmbeddablePages) so the two totals coincide: once at reindex
* enqueue time (so the very first status poll already reports done=0) and again
* at the worker start (which re-asserts the same total and resets `done`).
* Resets `done` to 0 so a re-trigger never inherits a stale count.
*
* `ttlSeconds` lets the caller pick the record's lifetime. The enqueue-time
* pre-seed passes a SHORT ttl: if `aiQueue.add()` de-duplicates against a job
* that is just finishing (its worker hasn't yet removed the job but already
* ran its `clear()`), no new worker starts to clear this phantom seed, so a
* short ttl lets it expire in seconds instead of sticking for the full TTL.
* The worker's own `start()` at the begin of a real run overwrites this entry
* and raises the ttl back to the default full TTL.
*/
async start(workspaceId: string, total: number): Promise<void> {
async start(
workspaceId: string,
total: number,
ttlSeconds: number = TTL_SECONDS,
): Promise<void> {
const key = this.key(workspaceId);
try {
await this.redis
@@ -80,7 +93,7 @@ export class EmbeddingReindexProgressService {
done: '0',
startedAt: String(Date.now()),
})
.expire(key, TTL_SECONDS)
.expire(key, ttlSeconds)
.exec();
} catch (err) {
this.logger.warn(