feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the
server via the workspace's OpenAI-compatible AI provider (Whisper /
gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT creds parity with chat/
  embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, fallbacks to
  chat baseUrl/key). Both provider whitelists updated (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace
  scope + throttle, 25MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the
  Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine),
  MicButton, transcribe service; integrated into the chat composer and a
  new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
vvzvlad
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions

View File

@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
systemPrompt?: string;
apiKey?: string;
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
sttApiKey?: string;
}
/**
@@ -113,6 +116,7 @@ export class AiSettingsService {
driver: provider.driver,
chatModel: provider.chatModel,
embeddingModel: provider.embeddingModel,
sttModel: provider.sttModel,
baseUrl: provider.baseUrl,
systemPrompt: provider.systemPrompt,
};
@@ -122,6 +126,10 @@ export class AiSettingsService {
// unconditionally.
config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;
// Effective STT base URL: the STT-specific value, else the chat base URL.
// Set unconditionally, same rationale as embeddingBaseUrl.
config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
if (provider.driver !== 'ollama') {
const creds = await this.aiProviderCredentialsRepo.find(
workspaceId,
@@ -134,6 +142,10 @@ export class AiSettingsService {
config.embeddingApiKey = creds?.embeddingApiKeyEnc
? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
: config.apiKey;
// Effective STT key: the STT-specific key, else the chat key.
config.sttApiKey = creds?.sttApiKeyEnc
? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
: config.apiKey;
}
return config;
@@ -151,6 +163,7 @@ export class AiSettingsService {
let hasApiKey = false;
let hasEmbeddingApiKey = false;
let hasSttApiKey = false;
if (provider.driver) {
const creds = await this.aiProviderCredentialsRepo.find(
workspaceId,
@@ -158,6 +171,7 @@ export class AiSettingsService {
);
hasApiKey = !!creds?.apiKeyEnc;
hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
hasSttApiKey = !!creds?.sttApiKeyEnc;
}
// totalPages now counts only pages with embeddable content (non-empty text
@@ -174,9 +188,12 @@ export class AiSettingsService {
embeddingModel: provider.embeddingModel,
baseUrl: provider.baseUrl,
embeddingBaseUrl: provider.embeddingBaseUrl,
sttModel: provider.sttModel,
sttBaseUrl: provider.sttBaseUrl,
systemPrompt: provider.systemPrompt,
hasApiKey,
hasEmbeddingApiKey,
hasSttApiKey,
indexedPages,
totalPages,
};
@@ -197,7 +214,7 @@ export class AiSettingsService {
workspaceId: string,
dto: UpdateAiSettingsInput,
): Promise<MaskedAiSettings> {
const { apiKey, embeddingApiKey, ...nonSecret } = dto;
const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;
// Persist non-secret provider fields (only those present in the partial).
const providerPatch: Partial<AiProviderSettings> = {};
@@ -207,6 +224,8 @@ export class AiSettingsService {
'embeddingModel',
'baseUrl',
'embeddingBaseUrl',
'sttModel',
'sttBaseUrl',
'systemPrompt',
] as const) {
if (nonSecret[key] !== undefined) {
@@ -222,7 +241,11 @@ export class AiSettingsService {
// Key handling (write-only). All three keys (chat, embedding, STT) share the
// same target driver and the same "driver required" guard, resolved once.
if (apiKey !== undefined || embeddingApiKey !== undefined) {
if (
apiKey !== undefined ||
embeddingApiKey !== undefined ||
sttApiKey !== undefined
) {
const stored = await this.readProvider(workspaceId);
const targetDriver = dto.driver ?? stored.driver;
if (!targetDriver) {
@@ -264,6 +287,23 @@ export class AiSettingsService {
);
}
}
// STT key.
if (sttApiKey !== undefined) {
if (sttApiKey === '') {
await this.aiProviderCredentialsRepo.clearSttKey(
workspaceId,
targetDriver,
);
} else {
const enc = this.secretBox.encryptSecret(sttApiKey);
await this.aiProviderCredentialsRepo.upsertSttKey(
workspaceId,
targetDriver,
enc,
);
}
}
}
return this.getMasked(workspaceId);

View File

@@ -0,0 +1,13 @@
import { ServiceUnavailableException } from '@nestjs/common';
/**
 * Signals that speech-to-text (STT) cannot run for the workspace because no
 * usable STT configuration was found (for example, no `sttModel` is set).
 *
 * Extends `ServiceUnavailableException`, so the transcription endpoint can
 * answer with a "service unavailable" error on its own, whether or not chat
 * or embeddings are configured. It is kept separate from the chat and
 * embedding "not configured" exceptions for that reason.
 */
export class AiSttNotConfiguredException extends ServiceUnavailableException {
  /** Creates the exception with its fixed, client-facing message. */
  constructor() {
    super('AI STT model not configured');
  }
}

View File

@@ -4,6 +4,7 @@ import {
generateText,
type EmbeddingModel,
type LanguageModel,
type TranscriptionModel,
} from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
import { AiSettingsService } from './ai-settings.service';
import { AiNotConfiguredException } from './ai-not-configured.exception';
import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
import { describeProviderError } from './ai-error.util';
/**
@@ -106,6 +108,26 @@ export class AiService {
}
}
/**
 * Build the transcription (speech-to-text) model for one workspace.
 *
 * The model is always created through the OpenAI-compatible provider
 * (`createOpenAI(...).transcription(...)`), whatever chat driver the
 * workspace uses. The endpoint is the STT-specific base URL when set,
 * otherwise the chat base URL. The key is the resolved STT key, which
 * AiSettingsService.resolve already falls back to the chat key.
 *
 * A new provider instance is created on every call, and this method does not
 * log the decrypted key.
 *
 * @param workspaceId Workspace whose AI settings are resolved.
 * @returns The transcription model bound to the workspace's STT model id.
 * @throws AiSttNotConfiguredException (a ServiceUnavailableException) when
 *   the resolved config is missing or has no `sttModel`.
 */
async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
  const settings = await this.aiSettings.resolve(workspaceId);
  if (!settings?.sttModel) {
    throw new AiSttNotConfiguredException();
  }

  // Prefer the STT-specific endpoint; otherwise reuse the chat endpoint.
  const baseURL = settings.sttBaseUrl || settings.baseUrl;
  // Keyless self-hosted whisper has no key to send, so pass a placeholder.
  const apiKey = settings.sttApiKey ?? 'unused';

  const provider = createOpenAI({ apiKey, baseURL });
  return provider.transcription(settings.sttModel);
}
/**
* Embed a batch of texts with the workspace embedding model. Returns one
* vector per input, in the same order. Thin wrapper over the AI SDK's

View File

@@ -21,6 +21,9 @@ export interface AiProviderSettings {
baseUrl?: string;
// Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
embeddingBaseUrl?: string;
sttModel?: string;
// STT-specific base URL. Falls back to baseUrl when empty/unset.
sttBaseUrl?: string;
systemPrompt?: string;
}
@@ -31,12 +34,15 @@ export interface AiProviderSettings {
*
* `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
* key, already resolved with the chat-value fallback applied by `resolve`.
* `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
* already resolved with the chat-value fallback applied by `resolve`.
*/
export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
  /** Provider driver selected for the workspace, if any. */
  driver?: AiDriver;
  /** Model id used for chat. */
  chatModel?: string;
  /** Chat API key; also the fallback for the embedding and STT keys. */
  apiKey?: string;
  /** Embedding key: the embedding-specific key, else the chat key. */
  embeddingApiKey?: string;
  /** STT key: the STT-specific key, else the chat key. */
  sttApiKey?: string;
}
/**
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
embeddingModel?: string;
baseUrl?: string;
embeddingBaseUrl?: string;
sttModel?: string;
sttBaseUrl?: string;
systemPrompt?: string;
hasApiKey: boolean;
hasEmbeddingApiKey: boolean;
hasSttApiKey: boolean;
// RAG indexing coverage for the settings UI.
indexedPages: number;
totalPages: number;

View File

@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
/**
* Admin update payload for the workspace AI provider settings.
*
* `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
* encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
* any endpoint. The global ValidationPipe runs with `whitelist: true`, so
* unknown fields are stripped.
* `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
* stored encrypted, '' → cleared, absent → left untouched. They are NEVER
* returned by any endpoint. The global ValidationPipe runs with
* `whitelist: true`, so unknown fields are stripped.
*/
export class UpdateAiSettingsDto {
@IsOptional()
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
@IsOptional()
@IsString()
embeddingApiKey?: string;
@IsOptional()
@IsString()
sttModel?: string;
@IsOptional()
@IsString()
sttBaseUrl?: string;
@IsOptional()
@IsString()
sttApiKey?: string;
}