fix(ai-embedding): abort bulk reindex on fatal provider errors
reindexWorkspace isolated every per-page failure, so an invalid/missing API key (401 "User not found") made all pages fail identically while the batch kept issuing hundreds of doomed requests against the provider. Add isFatalProviderError() (401/403 auth, 402 billing) and abort the whole batch on such errors; 429 rate-limit and embedding timeouts stay per-page isolated. Adds unit tests for the predicate and a regression test for the abort/iterate control flow. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
import { EmbeddingIndexerService } from './embedding-indexer.service';
|
||||
import { PageRepo } from '@docmost/db/repos/page/page.repo';
|
||||
import { PageEmbeddingRepo } from '@docmost/db/repos/ai-chat/page-embedding.repo';
|
||||
import { KyselyDB } from '@docmost/db/types/kysely.types';
|
||||
import { AiService } from '../../../integrations/ai/ai.service';
|
||||
|
||||
/**
|
||||
* Unit tests for EmbeddingIndexerService.reindexWorkspace's batch control flow.
|
||||
*
|
||||
* The constructor body only stores its deps, so the service can be unit-built
|
||||
* with lightweight mocks — no Nest module graph. We stub only the methods that
|
||||
* reindexWorkspace actually touches:
|
||||
* - aiService.getEmbeddingModel -> a model string so the up-front configured
|
||||
* check passes,
|
||||
* - pageRepo.getIdsByWorkspace -> three page ids,
|
||||
* - service.reindexPage -> spied per test to drive the per-page outcome.
|
||||
*
|
||||
* The point under test is the catch block: a FATAL provider error (auth/billing)
|
||||
* must abort the whole batch (re-throw, stop iterating), while a non-fatal error
|
||||
* keeps per-page isolation (failed++, continue to the next page).
|
||||
*/
|
||||
describe('EmbeddingIndexerService.reindexWorkspace fail-fast', () => {
  const WORKSPACE_ID = 'ws-1';

  /**
   * Assemble an EmbeddingIndexerService around minimal hand-rolled doubles.
   *
   * Only two collaborator methods are stubbed here:
   *  - pageRepo.getIdsByWorkspace resolves to three page ids, so the batch has
   *    exactly three iterations available;
   *  - aiService.getEmbeddingModel resolves to a model name.
   * The embedding repo and the db handle are empty objects.
   *
   * @returns the service under test together with the two stubbed doubles.
   */
  const buildIndexer = () => {
    const pageIds = ['p1', 'p2', 'p3'];
    const pageRepo = {
      getIdsByWorkspace: jest.fn().mockResolvedValue(pageIds),
    };
    const aiService = {
      getEmbeddingModel: jest.fn().mockResolvedValue('some-model'),
    };
    const pageEmbeddingRepo = {};
    const db = {};

    const service = new EmbeddingIndexerService(
      pageRepo as unknown as PageRepo,
      pageEmbeddingRepo as unknown as PageEmbeddingRepo,
      aiService as unknown as AiService,
      db as unknown as KyselyDB,
    );

    return { service, pageRepo, aiService };
  };

  it('aborts after the first page on a FATAL (401) provider error', async () => {
    const { service } = buildIndexer();
    // Every page would hit the same 401, so the batch is expected to give up
    // as soon as the first one is seen.
    const authFailure = { statusCode: 401, message: 'User not found' };
    const pageSpy = jest.spyOn(service, 'reindexPage');
    pageSpy.mockRejectedValue(authFailure);

    const outcome = service.reindexWorkspace(WORKSPACE_ID);

    // The fatal error is surfaced to the caller rather than swallowed.
    await expect(outcome).rejects.toMatchObject({
      statusCode: 401,
    });
    // Exactly one attempt: the second and third pages are never touched.
    expect(pageSpy).toHaveBeenCalledTimes(1);
  });

  it('keeps per-page isolation on a non-fatal error (plain Error, no statusCode)', async () => {
    const { service } = buildIndexer();
    // An error without a statusCode is treated as non-fatal: count it and
    // move on to the next page.
    const pageSpy = jest.spyOn(service, 'reindexPage');
    pageSpy.mockRejectedValue(new Error('boom'));

    const outcome = service.reindexWorkspace(WORKSPACE_ID);

    // The batch itself still settles successfully although each page failed.
    await expect(outcome).resolves.toBeUndefined();
    // Each of the three pages got its own attempt.
    expect(pageSpy).toHaveBeenCalledTimes(3);
  });

  it('processes every page on the all-success path', async () => {
    const { service } = buildIndexer();
    const pageSpy = jest.spyOn(service, 'reindexPage');
    pageSpy.mockResolvedValue(undefined);

    const outcome = service.reindexWorkspace(WORKSPACE_ID);

    await expect(outcome).resolves.toBeUndefined();
    expect(pageSpy).toHaveBeenCalledTimes(3);
  });
});
|
||||
@@ -10,7 +10,10 @@ import { InjectKysely } from 'nestjs-kysely';
|
||||
import { executeTx } from '@docmost/db/utils';
|
||||
import { AiService } from '../../../integrations/ai/ai.service';
|
||||
import { AiEmbeddingNotConfiguredException } from '../../../integrations/ai/ai-embedding-not-configured.exception';
|
||||
import { describeProviderError } from '../../../integrations/ai/ai-error.util';
|
||||
import {
|
||||
describeProviderError,
|
||||
isFatalProviderError,
|
||||
} from '../../../integrations/ai/ai-error.util';
|
||||
import { jsonToText } from '../../../collaboration/collaboration.util';
|
||||
|
||||
// NOTE: the `page_embeddings.embedding` column is now dimension-agnostic
|
||||
@@ -229,8 +232,19 @@ export class EmbeddingIndexerService {
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
// Per-page isolation: one failure (incl. an embedding timeout) must not
|
||||
// abort the whole batch.
|
||||
// A fatal provider error (invalid/missing key, no credits) recurs
|
||||
// identically on EVERY remaining page. Abort the whole batch instead of
|
||||
// issuing hundreds of doomed requests against the provider.
|
||||
if (isFatalProviderError(err)) {
|
||||
this.logger.error(
|
||||
`reindexWorkspace: aborting at [${position}/${total}] for workspace ` +
|
||||
`${workspaceId} — fatal provider error, remaining pages would fail ` +
|
||||
`identically: ${describeProviderError(err)}`,
|
||||
);
|
||||
throw err;
|
||||
}
|
||||
// Per-page isolation: one non-fatal failure (incl. an embedding timeout)
|
||||
// must not abort the whole batch.
|
||||
failed++;
|
||||
this.logger.error(
|
||||
`reindexWorkspace: [${position}/${total}] failed to reindex page ${pageId} ` +
|
||||
|
||||
Reference in New Issue
Block a user