From 80c900eb548334a062895bff2a28993a20c52961 Mon Sep 17 00:00:00 2001
From: vvzvlad <git@vvzvlad.xyz>
Date: Thu, 18 Jun 2026 03:07:02 +0300
Subject: [PATCH] fix(ai): make RAG indexer observable and bound hung embedding
 calls

The bulk embedding reindex could hang on a single page forever
("Indexed 27 of 34 pages") with zero log output:
- all progress logs were debug-level, suppressed in production (pino info);
- embedMany() had no timeout, so a slow/hung embeddings endpoint blocked
  the sequential per-page loop indefinitely.

Changes:
- ai.service.embedTexts: bound embedMany with AbortSignal.timeout
  (configurable via AI_EMBEDDING_TIMEOUT_MS, default 120000ms); on timeout
  throw a clear, greppable message, classified by both signal.aborted and
  the error name (TimeoutError/AbortError/ResponseAborted) so a real
  provider error racing the timer keeps its diagnostics.
- embedding-indexer.reindexWorkspace: promote lifecycle/progress logs to
  info; log "[i/N] indexing page <id>" BEFORE the await so a hang names the
  stuck page; warn when a finished page took >=30s; add timing + final summary.
- .env.example: document AI_EMBEDDING_TIMEOUT_MS.
---
 .env.example                                  |  4 ++
 .../embedding/embedding-indexer.service.ts    | 44 +++++++++++++-----
 apps/server/src/integrations/ai/ai.service.ts | 46 ++++++++++++++++++-
 3 files changed, 80 insertions(+), 14 deletions(-)
diff --git a/.env.example b/.env.example
index 99e47021..fbd32428 100644
--- a/.env.example
+++ b/.env.example
@@ -77,3 +77,7 @@ MCP_DOCMOST_PASSWORD=
 # the workspace MCP toggle and network isolation (do not expose the port publicly).
 # MCP_TOKEN=
 # MCP_SESSION_IDLE_MS=1800000
+
+# Timeout in milliseconds for one embedding call, including SDK retries.
+# A call to a slow/hung endpoint fails after this; a bulk reindex moves on.
+# AI_EMBEDDING_TIMEOUT_MS=120000
diff --git a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts
index 1bc4ee06..25c73f29 100644
--- a/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts
+++ b/apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts
@@ -25,6 +25,10 @@ import { jsonToText } from '../../../collaboration/collaboration.util';
 const CHUNK_SIZE = 1000;
 const CHUNK_OVERLAP = 200;
 
+// Threshold in ms: during a bulk reindex, a page that is indexed successfully
+// but takes at least this long is logged at WARN once it has finished.
+const SLOW_PAGE_MS = 30_000;
+
 /**
  * Vector-RAG indexer (§6.7 stage D / §14[M1]). Turns a page's plain text into
  * chunk embeddings and persists them so the `semanticSearch` agent tool can do
@@ -168,7 +172,7 @@ export class EmbeddingIndexerService {
       await this.aiService.getEmbeddingModel(workspaceId);
     } catch (err) {
       if (err instanceof AiEmbeddingNotConfiguredException) {
-        this.logger.debug(
+        this.logger.log(
           `reindexWorkspace: embeddings not configured for workspace ${workspaceId}, skipping`,
         );
         return;
@@ -177,28 +181,44 @@ export class EmbeddingIndexerService {
     }
 
     const pageIds = await this.pageRepo.getIdsByWorkspace(workspaceId);
-    this.logger.debug(
-      `reindexWorkspace: reindexing ${pageIds.length} page(s) for workspace ${workspaceId}`,
+    const total = pageIds.length;
+    const startedAt = Date.now();
+    this.logger.log(
+      `reindexWorkspace: starting reindex of ${total} page(s) for workspace ${workspaceId}`,
     );
 
     let failed = 0;
-    for (const pageId of pageIds) {
+    for (let i = 0; i < total; i++) {
+      const pageId = pageIds[i];
+      const position = i + 1;
+      // Log BEFORE the await: if the embedding call hangs, this is the last line
+      // in the log and it names the exact page that is stuck.
+      this.logger.log(
+        `reindexWorkspace: [${position}/${total}] indexing page ${pageId} (workspace ${workspaceId})`,
+      );
+      const pageStartedAt = Date.now();
       try {
         await this.reindexPage(pageId);
+        const elapsed = Date.now() - pageStartedAt;
+        if (elapsed >= SLOW_PAGE_MS) {
+          this.logger.warn(
+            `reindexWorkspace: [${position}/${total}] page ${pageId} took ${elapsed}ms`,
+          );
+        }
       } catch (err) {
-        // Per-page isolation: one failure must not abort the whole batch.
+        // Per-page isolation: one failure (incl. an embedding timeout) must not
+        // abort the whole batch.
         failed++;
         this.logger.error(
-          `reindexWorkspace: failed to reindex page ${pageId}: ${describeProviderError(
-            err,
-          )}`,
+          `reindexWorkspace: [${position}/${total}] failed to reindex page ${pageId} ` +
+            `after ${Date.now() - pageStartedAt}ms: ${describeProviderError(err)}`,
         );
       }
     }
-    this.logger.debug(
-      `reindexWorkspace: done for workspace ${workspaceId} (${
-        pageIds.length - failed
-      }/${pageIds.length} pages)`,
+
+    this.logger.log(
+      `reindexWorkspace: done for workspace ${workspaceId}: ` +
+        `${total - failed}/${total} indexed, ${failed} failed in ${Date.now() - startedAt}ms`,
     );
   }
 
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index 8d063920..f2b6a155 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -114,8 +114,50 @@ export class AiService {
   async embedTexts(workspaceId: string, texts: string[]): Promise<number[][]> {
     if (texts.length === 0) return [];
     const model = await this.getEmbeddingModel(workspaceId);
-    const { embeddings } = await embedMany({ model, values: texts });
-    return embeddings;
+    // Bound the embedding call so a slow/hung embeddings endpoint rejects with an
+    // error (and the caller can move on to the next page) instead of blocking
+    // forever. One signal caps the WHOLE embedMany call, including the SDK's
+    // internal retries/backoff (embedMany defaults to maxRetries: 2).
+    const timeoutMs = AiService.embeddingTimeoutMs();
+    const signal = AbortSignal.timeout(timeoutMs);
+    try {
+      const { embeddings } = await embedMany({
+        model,
+        values: texts,
+        abortSignal: signal,
+      });
+      return embeddings;
+    } catch (err) {
+      // On timeout the signal aborts with an opaque TimeoutError; replace it with
+      // a clear, greppable message naming the workspace, chunk count and env knob.
+      // Require BOTH an aborted signal AND an abort-like error name: a genuine
+      // provider error caught after the timer has already fired also sees
+      // `signal.aborted === true`, and is rethrown unchanged to keep its details.
+      // The name list mirrors the SDK's own isAbortError (@ai-sdk/provider-utils):
+      // TimeoutError, AbortError and ResponseAborted (Next.js).
+      const abortLike =
+        err instanceof Error &&
+        (err.name === 'TimeoutError' ||
+          err.name === 'AbortError' ||
+          err.name === 'ResponseAborted');
+      if (signal.aborted && abortLike) {
+        throw new Error(
+          `Embedding request timed out after ${timeoutMs}ms ` +
+            `(workspace ${workspaceId}, ${texts.length} chunk(s)). ` +
+            `Increase AI_EMBEDDING_TIMEOUT_MS or check the embeddings endpoint.`,
+        );
+      }
+      throw err;
+    }
+  }
+
+  /**
+   * Per-embedding-call timeout in ms (AI_EMBEDDING_TIMEOUT_MS), truncated to an
+   * integer and capped at 2^31-1 (Node timer limit); 120000 if unset/invalid.
+   */
+  private static embeddingTimeoutMs(): number {
+    const raw = Math.trunc(Number(process.env.AI_EMBEDDING_TIMEOUT_MS));
+    return raw > 0 ? Math.min(raw, 2_147_483_647) : 120_000;
   }
 
   /**