feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the
server via the workspace's OpenAI-compatible AI provider (Whisper /
gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT creds parity with chat/
  embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, falls back to
  chat baseUrl/key). Both provider whitelists updated (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace
  scope + throttle, 25MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the
  Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine),
  MicButton, transcribe service; integrated into the chat composer and a
  new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
vvzvlad
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions

View File

@@ -1,4 +1,5 @@
import {
BadRequestException,
Body,
Controller,
ForbiddenException,
@@ -9,6 +10,7 @@ import {
Req,
Res,
UseGuards,
UseInterceptors,
} from '@nestjs/common';
import { Throttle } from '@nestjs/throttler';
import { FastifyReply, FastifyRequest } from 'fastify';
@@ -22,7 +24,9 @@ import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo';
import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo';
import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard';
import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names';
import { FileInterceptor } from '../../common/interceptors/file.interceptor';
import { AiChatService, AiChatStreamBody } from './ai-chat.service';
import { AiTranscriptionService } from './ai-transcription.service';
import {
ChatIdDto,
GetChatMessagesDto,
@@ -43,6 +47,7 @@ export class AiChatController {
private readonly aiChatService: AiChatService,
private readonly aiChatRepo: AiChatRepo,
private readonly aiChatMessageRepo: AiChatMessageRepo,
private readonly aiTranscription: AiTranscriptionService,
) {}
/** List the requesting user's chats in this workspace (paginated). */
@@ -180,6 +185,74 @@ export class AiChatController {
}
}
/**
 * Transcribe an uploaded audio clip to text using the workspace STT model.
 *
 * Gated by settings.ai.dictation: responds 403 unless the flag is exactly
 * `true`. Expects a single multipart file part (max 25MB) whose base MIME
 * type is on the audio whitelist below.
 *
 * @param req Fastify request; the upload is read via `req.file()`.
 * @param workspace Workspace resolved for the request; supplies the
 *   dictation flag and the id passed to the transcription service.
 * @returns `{ text }` with the transcript from AiTranscriptionService.
 * @throws ForbiddenException when dictation is not enabled.
 * @throws BadRequestException when no file is sent, the file is empty or
 *   exceeds 25MB, or its MIME type is not whitelisted.
 */
@HttpCode(HttpStatus.OK)
@UseGuards(JwtAuthGuard, UserThrottlerGuard)
@Throttle({ [AI_CHAT_THROTTLER]: { limit: 20, ttl: 60000 } })
@Post('transcribe')
@UseInterceptors(FileInterceptor)
async transcribe(
  @Req() req: any,
  @AuthWorkspace() workspace: Workspace,
): Promise<{ text: string }> {
  // Gate: dictation must be explicitly enabled for the workspace.
  const settings = (workspace.settings ?? {}) as {
    ai?: { dictation?: boolean };
  };
  if (settings.ai?.dictation !== true) {
    throw new ForbiddenException('Dictation is disabled');
  }

  let file = null;
  try {
    // Whisper hard-caps uploads at 25MB; allow a single file.
    file = await req.file({ limits: { fileSize: 25 * 1024 * 1024, files: 1 } });
  } catch (err: any) {
    if (err?.statusCode === 413) {
      throw new BadRequestException('Audio file too large (max 25MB)');
    }
    throw err;
  }
  if (!file) throw new BadRequestException('No audio uploaded');

  // Whitelist audio container types produced by browser MediaRecorder
  // (Chrome/FF: webm/opus, Safari: mp4) plus common STT-accepted formats.
  const allowedMime = new Set([
    'audio/webm',
    'audio/ogg',
    'audio/mp4',
    'audio/mpeg',
    'audio/wav',
    'audio/x-wav',
    'audio/wave',
    'audio/m4a',
    'audio/x-m4a',
  ]);
  // MediaRecorder mimetypes carry parameters (e.g. "audio/webm;codecs=opus");
  // compare only the base type. A missing mimetype is treated as unsupported
  // rather than crashing on `.split`.
  const baseMime = String(file.mimetype ?? '')
    .split(';')[0]
    .trim()
    .toLowerCase();
  if (!allowedMime.has(baseMime)) {
    throw new BadRequestException('Unsupported audio format');
  }

  let buf: Buffer;
  try {
    buf = await file.toBuffer();
  } catch (err: any) {
    // With @fastify/multipart throwFileSizeLimit:true, the 25MB cap is enforced
    // when the stream is consumed (here), not at req.file().
    if (err?.statusCode === 413) {
      throw new BadRequestException('Audio file too large (max 25MB)');
    }
    throw err;
  }
  // If the plugin is configured NOT to throw on the size limit, the stream is
  // cut off silently and flagged as truncated; never transcribe a partial clip.
  if (file.file?.truncated === true) {
    throw new BadRequestException('Audio file too large (max 25MB)');
  }
  // A zero-byte clip (e.g. recording stopped immediately) cannot be
  // transcribed; fail fast with a 400 instead of an opaque provider error.
  if (buf.length === 0) {
    throw new BadRequestException('Audio file is empty');
  }

  const text = await this.aiTranscription.transcribe(workspace.id, buf);
  return { text };
}
/**
* Ensure the chat exists, belongs to this workspace, AND was created by the
* requesting user (per-user isolation). Throws ForbiddenException otherwise.

View File

@@ -3,6 +3,7 @@ import { AiModule } from '../../integrations/ai/ai.module';
import { TokenModule } from '../auth/token.module';
import { AiChatController } from './ai-chat.controller';
import { AiChatService } from './ai-chat.service';
import { AiTranscriptionService } from './ai-transcription.service';
import { AiChatToolsService } from './tools/ai-chat-tools.service';
import { EmbeddingModule } from './embedding/embedding.module';
import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@@ -21,6 +22,6 @@ import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@Module({
imports: [AiModule, TokenModule, EmbeddingModule, ExternalMcpModule],
controllers: [AiChatController],
// NOTE(review): `providers` appears twice below — this looks like the removed
// and the added line of a diff shown without +/- markers. The second list
// (the one that registers AiTranscriptionService) is presumably the intended
// one; confirm and keep a single `providers` entry.
providers: [AiChatService, AiChatToolsService],
providers: [AiChatService, AiTranscriptionService, AiChatToolsService],
})
export class AiChatModule {}

View File

@@ -0,0 +1,20 @@
import { Injectable } from '@nestjs/common';
import { experimental_transcribe as transcribe } from 'ai';
import { AiService } from '../../integrations/ai/ai.service';
/**
 * Speech-to-text service for uploaded audio.
 *
 * Resolves the workspace's STT model through AiService and hands the audio to
 * the AI SDK's experimental_transcribe. Nothing here logs the audio or the
 * provider key.
 */
@Injectable()
export class AiTranscriptionService {
  constructor(private readonly ai: AiService) {}

  /**
   * Run the workspace STT model over an audio buffer.
   *
   * @param workspaceId Workspace whose STT model is used.
   * @param audio Raw audio bytes to transcribe.
   * @returns The transcript with leading/trailing whitespace removed.
   */
  async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
    const sttModel = await this.ai.getTranscriptionModel(workspaceId);
    const result = await transcribe({ model: sttModel, audio });
    return result.text.trim();
  }
}

View File

@@ -49,6 +49,10 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) {
@IsBoolean()
aiChat: boolean;
@IsOptional()
@IsBoolean()
aiDictation: boolean;
@IsOptional()
@IsInt()
@Min(1)

View File

@@ -497,6 +497,20 @@ export class WorkspaceService {
);
}
if (typeof updateWorkspaceDto.aiDictation !== 'undefined') {
const prev = settingsBefore?.ai?.dictation ?? false;
if (prev !== updateWorkspaceDto.aiDictation) {
before.aiDictation = prev;
after.aiDictation = updateWorkspaceDto.aiDictation;
}
await this.workspaceRepo.updateAiSettings(
workspaceId,
'dictation',
updateWorkspaceDto.aiDictation,
trx,
);
}
delete updateWorkspaceDto.restrictApiToAdmins;
delete updateWorkspaceDto.aiSearch;
delete updateWorkspaceDto.generativeAi;
@@ -504,6 +518,7 @@ export class WorkspaceService {
delete updateWorkspaceDto.mcpEnabled;
delete updateWorkspaceDto.allowMemberTemplates;
delete updateWorkspaceDto.aiChat;
delete updateWorkspaceDto.aiDictation;
await this.workspaceRepo.updateWorkspace(
updateWorkspaceDto,

View File

@@ -0,0 +1,18 @@
import { type Kysely } from 'kysely';
/**
 * Forward migration: adds the `stt_api_key_enc` text column to
 * `ai_provider_credentials`.
 *
 * The column holds the encrypted, STT-specific provider key. It is kept
 * separate from `api_key_enc` (the chat key) so the transcription model can use
 * a different token. No NOT NULL constraint is applied; when the value is NULL
 * the STT model falls back to `api_key_enc`.
 */
export async function up(db: Kysely<any>): Promise<void> {
  const credentialsTable = db.schema.alterTable('ai_provider_credentials');
  await credentialsTable.addColumn('stt_api_key_enc', 'text').execute();
}
/**
 * Reverse migration: drops the `stt_api_key_enc` column from
 * `ai_provider_credentials`.
 */
export async function down(db: Kysely<any>): Promise<void> {
  const credentialsTable = db.schema.alterTable('ai_provider_credentials');
  await credentialsTable.dropColumn('stt_api_key_enc').execute();
}

View File

@@ -98,4 +98,42 @@ export class AiProviderCredentialsRepo {
.where('driver', '=', driver)
.execute();
}
/**
 * Insert-or-update the encrypted STT key for a (workspace, driver) pair.
 *
 * When no row exists yet, one is inserted with only the STT key set, so
 * `apiKeyEnc` stays null (the column is nullable). When the row already
 * exists, only `sttApiKeyEnc` and `updatedAt` are overwritten, leaving the
 * chat and embedding keys as they were.
 *
 * @param workspaceId Workspace that owns the credentials row.
 * @param driver Provider driver the key belongs to.
 * @param sttApiKeyEnc Already-encrypted STT key to store.
 * @param trx Optional transaction to run the statement in.
 * @returns The row as returned by the statement's RETURNING clause.
 */
async upsertSttKey(
  workspaceId: string,
  driver: string,
  sttApiKeyEnc: string,
  trx?: KyselyTransaction,
): Promise<AiProviderCredentials> {
  const executor = dbOrTx(this.db, trx);
  const upsert = executor
    .insertInto('aiProviderCredentials')
    .values({ workspaceId, driver, sttApiKeyEnc })
    .onConflict((conflict) =>
      conflict
        .columns(['workspaceId', 'driver'])
        .doUpdateSet({ sttApiKeyEnc, updatedAt: new Date() }),
    )
    .returningAll();
  return upsert.executeTakeFirst();
}
/**
 * Null out the STT-specific key for a (workspace, driver) pair and bump
 * `updatedAt`. The chat and embedding keys on the same row are not touched.
 *
 * @param workspaceId Workspace that owns the credentials row.
 * @param driver Provider driver whose STT key is cleared.
 * @param trx Optional transaction to run the statement in.
 */
async clearSttKey(
  workspaceId: string,
  driver: string,
  trx?: KyselyTransaction,
): Promise<void> {
  const executor = dbOrTx(this.db, trx);
  const clearQuery = executor
    .updateTable('aiProviderCredentials')
    .set({ sttApiKeyEnc: null, updatedAt: new Date() })
    .where('workspaceId', '=', workspaceId)
    .where('driver', '=', driver);
  await clearQuery.execute();
}
}

View File

@@ -239,7 +239,7 @@ export class WorkspaceRepo {
// is a real jsonb object, never a double-encoded string. The CASE self-heals
// workspaces whose settings.ai.provider was previously corrupted into an
// array/string.
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'systemPrompt'];
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
const entries = Object.entries(provider).filter(
([k, v]) => v !== undefined && ALLOWED.includes(k),
);

View File

@@ -14,6 +14,8 @@ export interface AiProviderCredentials {
apiKeyEnc: string | null;
// Encrypted, embedding-specific provider key. Falls back to apiKeyEnc when null.
embeddingApiKeyEnc: string | null;
// Encrypted, STT-specific provider key. Falls back to apiKeyEnc when null.
sttApiKeyEnc: string | null;
createdAt: Generated<Timestamp>;
updatedAt: Generated<Timestamp>;
}

View File

@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
systemPrompt?: string;
apiKey?: string;
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
sttApiKey?: string;
}
/**
@@ -113,6 +116,7 @@ export class AiSettingsService {
driver: provider.driver,
chatModel: provider.chatModel,
embeddingModel: provider.embeddingModel,
sttModel: provider.sttModel,
baseUrl: provider.baseUrl,
systemPrompt: provider.systemPrompt,
};
@@ -122,6 +126,10 @@ export class AiSettingsService {
// unconditionally.
config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;
// Effective STT base URL: the STT-specific value, else the chat base URL.
// Set unconditionally, same rationale as embeddingBaseUrl.
config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
if (provider.driver !== 'ollama') {
const creds = await this.aiProviderCredentialsRepo.find(
workspaceId,
@@ -134,6 +142,10 @@ export class AiSettingsService {
config.embeddingApiKey = creds?.embeddingApiKeyEnc
? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
: config.apiKey;
// Effective STT key: the STT-specific key, else the chat key.
config.sttApiKey = creds?.sttApiKeyEnc
? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
: config.apiKey;
}
return config;
@@ -151,6 +163,7 @@ export class AiSettingsService {
let hasApiKey = false;
let hasEmbeddingApiKey = false;
let hasSttApiKey = false;
if (provider.driver) {
const creds = await this.aiProviderCredentialsRepo.find(
workspaceId,
@@ -158,6 +171,7 @@ export class AiSettingsService {
);
hasApiKey = !!creds?.apiKeyEnc;
hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
hasSttApiKey = !!creds?.sttApiKeyEnc;
}
// totalPages now counts only pages with embeddable content (non-empty text
@@ -174,9 +188,12 @@ export class AiSettingsService {
embeddingModel: provider.embeddingModel,
baseUrl: provider.baseUrl,
embeddingBaseUrl: provider.embeddingBaseUrl,
sttModel: provider.sttModel,
sttBaseUrl: provider.sttBaseUrl,
systemPrompt: provider.systemPrompt,
hasApiKey,
hasEmbeddingApiKey,
hasSttApiKey,
indexedPages,
totalPages,
};
@@ -197,7 +214,7 @@ export class AiSettingsService {
workspaceId: string,
dto: UpdateAiSettingsInput,
): Promise<MaskedAiSettings> {
const { apiKey, embeddingApiKey, ...nonSecret } = dto;
const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;
// Persist non-secret provider fields (only those present in the partial).
const providerPatch: Partial<AiProviderSettings> = {};
@@ -207,6 +224,8 @@ export class AiSettingsService {
'embeddingModel',
'baseUrl',
'embeddingBaseUrl',
'sttModel',
'sttBaseUrl',
'systemPrompt',
] as const) {
if (nonSecret[key] !== undefined) {
@@ -222,7 +241,11 @@ export class AiSettingsService {
// Key handling (write-only). All keys (chat, embedding, STT) share the same
// target driver and the same "driver required" guard, resolved once.
if (apiKey !== undefined || embeddingApiKey !== undefined) {
if (
apiKey !== undefined ||
embeddingApiKey !== undefined ||
sttApiKey !== undefined
) {
const stored = await this.readProvider(workspaceId);
const targetDriver = dto.driver ?? stored.driver;
if (!targetDriver) {
@@ -264,6 +287,23 @@ export class AiSettingsService {
);
}
}
// STT key.
if (sttApiKey !== undefined) {
if (sttApiKey === '') {
await this.aiProviderCredentialsRepo.clearSttKey(
workspaceId,
targetDriver,
);
} else {
const enc = this.secretBox.encryptSecret(sttApiKey);
await this.aiProviderCredentialsRepo.upsertSttKey(
workspaceId,
targetDriver,
enc,
);
}
}
}
return this.getMasked(workspaceId);

View File

@@ -0,0 +1,13 @@
import { ServiceUnavailableException } from '@nestjs/common';
/**
 * Signals that the workspace has no usable speech-to-text (STT) configuration.
 *
 * Extends ServiceUnavailableException and is a separate class from the chat
 * and embedding "not configured" errors, so the transcription endpoint can
 * fail on its own without implying anything about chat or embeddings.
 */
export class AiSttNotConfiguredException extends ServiceUnavailableException {
  constructor() {
    const message = 'AI STT model not configured';
    super(message);
  }
}

View File

@@ -4,6 +4,7 @@ import {
generateText,
type EmbeddingModel,
type LanguageModel,
type TranscriptionModel,
} from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
import { AiSettingsService } from './ai-settings.service';
import { AiNotConfiguredException } from './ai-not-configured.exception';
import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
import { describeProviderError } from './ai-error.util';
/**
@@ -106,6 +108,26 @@ export class AiService {
}
}
/**
 * Build the transcription (STT) model for a workspace from its resolved AI
 * config.
 *
 * The model is always created through the OpenAI-compatible provider
 * (`createOpenAI(...).transcription(...)`), whatever the chat driver is. The
 * base URL is the STT-specific one, else the chat base URL. The key is the
 * resolved `sttApiKey`, else the literal placeholder 'unused' for keyless
 * self-hosted endpoints. A new provider instance is built on every call, and
 * the key is never logged here.
 *
 * NOTE(review): AiSettingsService.resolve appears to skip the credentials
 * lookup when the driver is 'ollama', in which case `sttApiKey` would be unset
 * and the placeholder is sent — confirm that is intended.
 *
 * @param workspaceId Workspace whose AI settings are resolved.
 * @returns The transcription model for the configured `sttModel`.
 * @throws AiSttNotConfiguredException (-> 503) when no STT model is set.
 */
async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
  const resolved = await this.aiSettings.resolve(workspaceId);
  if (!resolved?.sttModel) {
    throw new AiSttNotConfiguredException();
  }

  const openaiCompatible = createOpenAI({
    // Keyless self-hosted whisper ignores the key; send a placeholder.
    apiKey: resolved.sttApiKey ?? 'unused',
    // STT-specific endpoint first, chat endpoint as the fallback.
    baseURL: resolved.sttBaseUrl || resolved.baseUrl,
  });
  return openaiCompatible.transcription(resolved.sttModel);
}
/**
* Embed a batch of texts with the workspace embedding model. Returns one
* vector per input, in the same order. Thin wrapper over the AI SDK's

View File

@@ -21,6 +21,9 @@ export interface AiProviderSettings {
baseUrl?: string;
// Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
embeddingBaseUrl?: string;
sttModel?: string;
// STT-specific base URL. Falls back to baseUrl when empty/unset.
sttBaseUrl?: string;
systemPrompt?: string;
}
@@ -31,12 +34,15 @@ export interface AiProviderSettings {
*
* `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
* key, already resolved with the chat-value fallback applied by `resolve`.
* `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
* already resolved with the chat-value fallback applied by `resolve`.
*/
// Resolved per-workspace AI config: the (optional) provider settings plus the
// effective keys. Every field is optional.
export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
// Provider driver configured for the workspace.
driver?: AiDriver;
// Chat model id.
chatModel?: string;
// Chat provider key; also the fallback for the two keys below.
apiKey?: string;
// Effective embedding key: the embedding-specific key, else the chat key.
embeddingApiKey?: string;
// Effective STT key: the STT-specific key, else the chat key.
sttApiKey?: string;
}
/**
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
embeddingModel?: string;
baseUrl?: string;
embeddingBaseUrl?: string;
sttModel?: string;
sttBaseUrl?: string;
systemPrompt?: string;
hasApiKey: boolean;
hasEmbeddingApiKey: boolean;
hasSttApiKey: boolean;
// RAG indexing coverage for the settings UI.
indexedPages: number;
totalPages: number;

View File

@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
/**
* Admin update payload for the workspace AI provider settings.
*
* `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
* encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
* any endpoint. The global ValidationPipe runs with `whitelist: true`, so
* unknown fields are stripped.
* `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
* stored encrypted, '' → cleared, absent → left untouched. They are NEVER
* returned by any endpoint. The global ValidationPipe runs with
* `whitelist: true`, so unknown fields are stripped.
*/
export class UpdateAiSettingsDto {
@IsOptional()
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
@IsOptional()
@IsString()
embeddingApiKey?: string;
@IsOptional()
@IsString()
sttModel?: string;
@IsOptional()
@IsString()
sttBaseUrl?: string;
@IsOptional()
@IsString()
sttApiKey?: string;
}