feat(dictation): reason-модель — говорящий tooltip на серой иконке + общий резолвер ошибок (#309) #314

Merged
vvzvlad merged 4 commits from feat/309-dictation-reasons into develop 2026-07-03 23:14:58 +03:00
14 changed files with 697 additions and 153 deletions
@@ -1274,6 +1274,10 @@
"Voice dictation is not configured": "Voice dictation is not configured",
"Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context",
"Dictation": "Dictation",
"Dictation becomes available once the page finishes connecting": "Dictation becomes available once the page finishes connecting",
"No connection to the collaboration server — dictation unavailable": "No connection to the collaboration server — dictation unavailable",
"This page is read-only": "This page is read-only",
"Request format": "Request format",
"How transcription requests are sent to the endpoint": "How transcription requests are sent to the endpoint",
"OpenAI-compatible (multipart/form-data)": "OpenAI-compatible (multipart/form-data)",
@@ -393,6 +393,17 @@
"No speech detected": "Речь не распознана",
"Transcription failed": "Не удалось распознать речь",
"Voice dictation is not configured": "Голосовой ввод не настроен",
"Start dictation": "Начать диктовку",
"Stop recording": "Остановить запись",
"Microphone access denied": "Доступ к микрофону запрещён",
"No microphone found": "Микрофон не найден",
"Microphone is unavailable or already in use": "Микрофон недоступен или уже используется",
"Could not start recording": "Не удалось начать запись",
"Audio recording is not available in this browser/context": "Запись аудио недоступна в этом браузере/контексте",
"Dictation": "Диктовка",
"Dictation becomes available once the page finishes connecting": "Диктовка станет доступна после подключения к документу",
"No connection to the collaboration server — dictation unavailable": "Нет связи с сервером совместного редактирования — диктовка недоступна",
"This page is read-only": "Страница открыта только для чтения",
"Embed PDF": "Встроить PDF",
"Upload and embed a PDF file.": "Загрузите и встроите PDF-файл.",
"Embed as PDF": "Встроить как PDF",
@@ -0,0 +1,92 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { render, screen } from "@testing-library/react";
import { MantineProvider } from "@mantine/core";
// A disabled mic must explain WHY it is unavailable rather than silently saying
// "Start dictation". This renders MicButton in its idle+disabled state with a
// forwarded reason and asserts the accessible label resolves to that reason's
// text via the shared resolver (dictation-status.resolveUnavailableLabel).
// matchMedia (read by MantineProvider) is stubbed globally in vitest.setup.ts.
// Pass i18n keys through verbatim so we assert the exact resolved string.
vi.mock("react-i18next", () => ({
useTranslation: () => ({ t: (s: string) => s }),
}));
// Keep both controllers inert and idle so MicButton renders the idle branch.
const idleCtl = {
status: "idle" as const,
start: vi.fn(async () => {}),
stop: vi.fn(),
cancel: vi.fn(),
audioLevel: 0,
errorMessage: null,
};
vi.mock("@/features/dictation/hooks/use-dictation", () => ({
useDictation: () => idleCtl,
}));
vi.mock("@/features/dictation/hooks/use-streaming-dictation", () => ({
useStreamingDictation: () => idleCtl,
}));
import { MicButton } from "./mic-button";
function renderButton(props: React.ComponentProps<typeof MicButton>) {
render(
<MantineProvider>
<MicButton {...props} />
</MantineProvider>,
);
}
describe("MicButton — disabled reason label", () => {
// jsdom has no MediaRecorder / mediaDevices, so isDictationSupported() would
// report "unsupported" and mask the forwarded reason. Stub both so the button
// is considered supported and the availability reason is what surfaces.
beforeEach(() => {
(globalThis as unknown as { MediaRecorder: unknown }).MediaRecorder =
class {};
Object.defineProperty(navigator, "mediaDevices", {
configurable: true,
value: { getUserMedia: vi.fn() },
});
});
afterEach(() => {
delete (globalThis as unknown as { MediaRecorder?: unknown }).MediaRecorder;
});
it("shows the cause-specific reason instead of 'Start dictation' when disabled with a reason", () => {
renderButton({ onText: () => {}, disabled: true, unavailableReason: "offline" });
const expected =
"No connection to the collaboration server — dictation unavailable";
// The reason surfaces as the accessible label (and the tooltip text).
const button = screen.getByRole("button", { name: expected });
expect(button).toBeDefined();
// It is marked disabled the Mantine way (data-disabled), NOT the native
// `disabled` attribute — otherwise pointer-events:none would kill the tooltip.
expect(button.getAttribute("data-disabled")).toBe("true");
expect(button.hasAttribute("disabled")).toBe(false);
// And it no longer silently reads "Start dictation".
expect(screen.queryByRole("button", { name: "Start dictation" })).toBeNull();
});
it("reads 'Start dictation' when enabled with no reason", () => {
renderButton({ onText: () => {} });
expect(
screen.getByRole("button", { name: "Start dictation" }),
).toBeDefined();
});
it("does not advertise 'Start dictation' when disabled with no reason", () => {
// A consumer passing bare `disabled` (e.g. the AI chat's isStreaming) with no
// unavailableReason must not get a hoverable mic whose tooltip invites
// "Start dictation" on a click that is rejected.
renderButton({ onText: () => {}, disabled: true });
expect(
screen.queryByRole("button", { name: "Start dictation" }),
).toBeNull();
const button = screen.getByRole("button");
expect(button.getAttribute("data-disabled")).toBe("true");
});
});
@@ -4,6 +4,11 @@ import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
import { useTranslation } from "react-i18next";
import { useDictation } from "@/features/dictation/hooks/use-dictation";
import { useStreamingDictation } from "@/features/dictation/hooks/use-streaming-dictation";
import {
isDictationSupported,
resolveUnavailableLabel,
type DictationUnavailableReason,
} from "@/features/dictation/dictation-status";
import classes from "./mic-button.module.css";
interface MicButtonProps {
@@ -21,6 +26,9 @@ interface MicButtonProps {
// When true, use the streaming (Silero-VAD) dictation controller, which emits
// text progressively as the user pauses; otherwise use the batch controller.
streaming?: boolean;
// When the mic is disabled for an availability reason, this is the cause the
// idle tooltip explains (e.g. pre-sync "connecting", "offline", "read-only").
unavailableReason?: DictationUnavailableReason;
}
/**
@@ -37,6 +45,7 @@ export const MicButton: FC<MicButtonProps> = ({
color,
iconSize,
streaming = false,
unavailableReason,
}) => {
const { t } = useTranslation();
// Call BOTH hooks unconditionally to respect the rules of hooks: which one is
@@ -46,7 +55,7 @@ export const MicButton: FC<MicButtonProps> = ({
const batchCtl = useDictation({ onText, onStart });
const streamingCtl = useStreamingDictation({ onText, onStart });
const ctl = streaming ? streamingCtl : batchCtl;
const { status, start, stop, audioLevel } = ctl;
const { status, start, stop, audioLevel, errorMessage } = ctl;
const resolvedIconSize = iconSize ?? (size === "lg" ? 18 : 16);
if (status === "recording") {
@@ -82,15 +91,28 @@ export const MicButton: FC<MicButtonProps> = ({
) {
// "loading" (streaming hook fetching the VAD model on first use) shows the
// same spinner+disabled state so the first click is visibly acknowledged and
// a confusing second click can't fire while the model loads.
const label = status === "loading" ? t("Preparing…") : t("Transcribing…");
// a confusing second click can't fire while the model loads. The error case
// explains the failure via the hook's resolved errorMessage instead of the
// transient "Transcribing…" label.
const label =
status === "error"
? (errorMessage ?? t("Transcription failed"))
: status === "loading"
? t("Preparing…")
: t("Transcribing…");
return (
<Tooltip label={label} withArrow>
<ActionIcon
size={size}
variant="subtle"
color={color}
disabled
// Mark disabled the Mantine way (data-disabled/aria-disabled) rather
// than the native `disabled` attribute: native `disabled` sets
// `pointer-events:none`, which suppresses hover so the Tooltip never
// fires. This is a status display with no click action to guard, so
// keeping it hoverable simply lets the error reason be read on hover.
data-disabled
aria-disabled
aria-label={label}
>
<Loader size="xs" />
@@ -99,18 +121,56 @@ export const MicButton: FC<MicButtonProps> = ({
);
}
// Idle branch. A grey/disabled mic must explain WHY it can't record. An
// unsupported browser/context is detected here; otherwise the parent forwards
// a cause-specific reason. We must NOT pass the native `disabled` prop: Mantine
// renders `<button disabled>` with `pointer-events:none`, which suppresses
// hover so the Tooltip never fires. Instead mark it disabled the Mantine way
// (data-disabled/aria-disabled) — keeping it hoverable and in the a11y tree —
// and guard the click ourselves.
const unsupported = !isDictationSupported();
const isDisabled = disabled || unsupported;
const reason: DictationUnavailableReason | undefined = unsupported
? "unsupported"
: unavailableReason;
const reasonLabel = reason ? resolveUnavailableLabel(reason, t) : undefined;
// A disabled mic with a known reason surfaces it on hover; an enabled mic
// invites "Start dictation". But a mic disabled with NO reason (e.g. a
// consumer that passes bare `disabled` — the AI chat's isStreaming, with no
// unavailableReason) must NOT hover a misleading, actionable "Start dictation"
// tooltip on a control that rejects the click. In that case we render the icon
// without a Tooltip and give it a neutral accessible label instead.
const ariaLabel = reasonLabel ?? (isDisabled ? t("Dictation") : t("Start dictation"));
const icon = (
<ActionIcon
size={size}
variant="subtle"
color={color}
onClick={(e) => {
if (isDisabled) {
e.preventDefault();
return;
}
void start();
}}
data-disabled={isDisabled || undefined}
aria-disabled={isDisabled}
aria-label={ariaLabel}
>
<IconMicrophone size={resolvedIconSize} />
</ActionIcon>
);
// Suppress the tooltip on a disabled mic that has nothing to explain — hovering
// a grey, unclickable mic should not advertise "Start dictation".
if (isDisabled && !reasonLabel) {
return icon;
}
return (
<Tooltip label={t("Start dictation")} withArrow>
<ActionIcon
size={size}
variant="subtle"
color={color}
onClick={() => void start()}
disabled={disabled}
aria-label={t("Start dictation")}
>
<IconMicrophone size={resolvedIconSize} />
</ActionIcon>
<Tooltip
label={reasonLabel ?? t("Start dictation")}
withArrow
>
{icon}
</Tooltip>
);
};
@@ -0,0 +1,156 @@
import { describe, it, expect } from "vitest";
import {
classifyGetUserMediaError,
classifyTranscriptionError,
dictationErrorMessage,
resolveUnavailableLabel,
isDictationSupported,
} from "./dictation-status";
// Unit tests for the shared dictation-status resolvers (dictation-status.ts).
// Both dictation hooks and the mic button form their user-facing strings here,
// so a regression in the classification or message mapping would silently swap
// what a user reads when the mic is grey or a recording fails. A fake `t`
// returns its key verbatim so we assert the exact i18n key each branch selects.
const t = (k: string) => k;
describe("classifyGetUserMediaError", () => {
it("maps NotAllowedError / SecurityError to mic-denied", () => {
expect(classifyGetUserMediaError({ name: "NotAllowedError" })).toBe(
"mic-denied",
);
expect(classifyGetUserMediaError({ name: "SecurityError" })).toBe(
"mic-denied",
);
});
it("maps NotFoundError / OverconstrainedError to no-mic", () => {
expect(classifyGetUserMediaError({ name: "NotFoundError" })).toBe("no-mic");
expect(classifyGetUserMediaError({ name: "OverconstrainedError" })).toBe(
"no-mic",
);
});
it("maps NotReadableError / AbortError to mic-in-use", () => {
expect(classifyGetUserMediaError({ name: "NotReadableError" })).toBe(
"mic-in-use",
);
expect(classifyGetUserMediaError({ name: "AbortError" })).toBe(
"mic-in-use",
);
});
it("maps anything else / undefined to unknown", () => {
expect(classifyGetUserMediaError({ name: "WeirdError" })).toBe("unknown");
expect(classifyGetUserMediaError(undefined)).toBe("unknown");
expect(classifyGetUserMediaError({})).toBe("unknown");
});
});
describe("classifyTranscriptionError", () => {
it("returns the verbatim server message when present", () => {
const err = { response: { status: 500, data: { message: "provider 404" } } };
expect(classifyTranscriptionError(err)).toEqual({
code: "transcription-failed",
serverMessage: "provider 404",
});
});
it("maps 503 / 403 (no server message) to stt-not-configured", () => {
expect(classifyTranscriptionError({ response: { status: 503 } })).toEqual({
code: "stt-not-configured",
});
expect(classifyTranscriptionError({ response: { status: 403 } })).toEqual({
code: "stt-not-configured",
});
});
it("falls back to transcription-failed with no server message otherwise", () => {
expect(classifyTranscriptionError({ response: { status: 500 } })).toEqual({
code: "transcription-failed",
});
expect(classifyTranscriptionError(new Error("network"))).toEqual({
code: "transcription-failed",
});
// Blank server message is ignored (does not win as verbatim text).
expect(
classifyTranscriptionError({ response: { data: { message: " " } } }),
).toEqual({ code: "transcription-failed" });
});
});
describe("dictationErrorMessage", () => {
it("maps each code to the expected i18n key", () => {
expect(dictationErrorMessage("mic-denied", t)).toBe(
"Microphone access denied",
);
expect(dictationErrorMessage("no-mic", t)).toBe("No microphone found");
expect(dictationErrorMessage("mic-in-use", t)).toBe(
"Microphone is unavailable or already in use",
);
expect(dictationErrorMessage("no-media-devices", t)).toBe(
"Audio recording is not available in this browser/context",
);
expect(dictationErrorMessage("stt-not-configured", t)).toBe(
"Voice dictation is not configured",
);
expect(dictationErrorMessage("transcription-failed", t)).toBe(
"Transcription failed",
);
expect(dictationErrorMessage("recorder-failed", t)).toBe(
"Could not start recording",
);
expect(dictationErrorMessage("vad-init-failed", t)).toBe(
"Could not start recording",
);
expect(dictationErrorMessage("unknown", t)).toBe(
"Could not start recording",
);
});
it("returns the server message verbatim for transcription-failed (not the t key)", () => {
expect(
dictationErrorMessage("transcription-failed", t, {
serverMessage: "quota exceeded",
}),
).toBe("quota exceeded");
});
it("appends the detail to recorder-failed / unknown", () => {
expect(
dictationErrorMessage("recorder-failed", t, { detail: "boom" }),
).toBe("Could not start recording: boom");
expect(dictationErrorMessage("unknown", t, { detail: "nope" })).toBe(
"Could not start recording: nope",
);
});
it("appends the detail to transcription-failed when there is no server message", () => {
expect(
dictationErrorMessage("transcription-failed", t, { detail: "timeout" }),
).toBe("Transcription failed: timeout");
});
});
describe("resolveUnavailableLabel", () => {
it("maps each reason to its expected i18n key", () => {
expect(resolveUnavailableLabel("connecting", t)).toBe(
"Dictation becomes available once the page finishes connecting",
);
expect(resolveUnavailableLabel("offline", t)).toBe(
"No connection to the collaboration server — dictation unavailable",
);
expect(resolveUnavailableLabel("read-only", t)).toBe(
"This page is read-only",
);
expect(resolveUnavailableLabel("unsupported", t)).toBe(
"Audio recording is not available in this browser/context",
);
});
});
describe("isDictationSupported", () => {
it("returns a boolean", () => {
expect(typeof isDictationSupported()).toBe("boolean");
});
});
@@ -0,0 +1,110 @@
// Single source of truth for "why dictation is unavailable" and "why it failed".
// Both dictation hooks and the mic button pull their user-facing strings from
// the resolvers here so the wording lives in exactly one place.
export type DictationUnavailableReason =
| "connecting"
| "offline"
| "read-only"
| "unsupported";
export type DictationErrorCode =
| "no-media-devices"
| "mic-denied"
| "no-mic"
| "mic-in-use"
| "recorder-failed"
| "vad-init-failed"
| "stt-not-configured"
| "transcription-failed"
| "unknown";
// True if this browser/context can record audio.
export function isDictationSupported(): boolean {
return (
typeof MediaRecorder !== "undefined" &&
typeof navigator !== "undefined" &&
!!navigator.mediaDevices?.getUserMedia
);
}
// getUserMedia / VAD.start rejection -> code, by DOMException .name.
export function classifyGetUserMediaError(err: unknown): DictationErrorCode {
const name = (err as { name?: string })?.name;
if (name === "NotAllowedError" || name === "SecurityError")
return "mic-denied";
if (name === "NotFoundError" || name === "OverconstrainedError")
return "no-mic";
if (name === "NotReadableError" || name === "AbortError") return "mic-in-use";
return "unknown";
}
// Transcription HTTP failure -> code (+ verbatim server message when present).
export function classifyTranscriptionError(err: unknown): {
code: DictationErrorCode;
serverMessage?: string;
} {
const resp = (
err as { response?: { status?: number; data?: { message?: string } } }
)?.response;
const serverMessage = resp?.data?.message;
if (serverMessage && serverMessage.trim().length > 0)
return { code: "transcription-failed", serverMessage };
if (resp?.status === 503 || resp?.status === 403)
return { code: "stt-not-configured" };
return { code: "transcription-failed" };
}
type TFn = (key: string) => string;
// Code -> user text. The ONE place runtime error strings are formed.
// serverMessage (verbatim) wins for transcription-failed; detail is appended
// to the generic "could not start"/"transcription failed" strings.
export function dictationErrorMessage(
code: DictationErrorCode,
t: TFn,
extra?: { serverMessage?: string; detail?: string },
): string {
const detail = extra?.detail;
switch (code) {
case "mic-denied":
return t("Microphone access denied");
case "no-mic":
return t("No microphone found");
case "mic-in-use":
return t("Microphone is unavailable or already in use");
case "no-media-devices":
return t("Audio recording is not available in this browser/context");
case "stt-not-configured":
return t("Voice dictation is not configured");
case "transcription-failed":
if (extra?.serverMessage && extra.serverMessage.trim().length > 0)
return extra.serverMessage;
return `${t("Transcription failed")}${detail ? `: ${detail}` : ""}`;
case "recorder-failed":
case "vad-init-failed":
case "unknown":
default:
return `${t("Could not start recording")}${detail ? `: ${detail}` : ""}`;
}
}
// Unavailable reason -> tooltip text (the ONE place these strings are formed).
export function resolveUnavailableLabel(
r: DictationUnavailableReason,
t: TFn,
): string {
switch (r) {
case "connecting":
return t("Dictation becomes available once the page finishes connecting");
case "offline":
return t(
"No connection to the collaboration server — dictation unavailable",
);
case "read-only":
return t("This page is read-only");
case "unsupported":
default:
return t("Audio recording is not available in this browser/context");
}
}
@@ -2,6 +2,11 @@ import { useCallback, useEffect, useRef, useState } from "react";
import { notifications } from "@mantine/notifications";
import { useTranslation } from "react-i18next";
import { transcribeAudio } from "@/features/dictation/services/dictation-service";
import {
classifyGetUserMediaError,
classifyTranscriptionError,
dictationErrorMessage,
} from "@/features/dictation/dictation-status";
// "loading" is set only by the streaming hook while it lazily loads the VAD
// model on first use; the batch hook never sets it. It exists so the streaming
@@ -26,6 +31,8 @@ interface UseDictationResult {
cancel: () => void;
// Smoothed live microphone level in the 0..1 range while recording (0 when idle).
audioLevel: number;
// The last error shown to the user (null until one occurs / on a new start).
errorMessage: string | null;
}
// Candidate container/codec combinations in preference order. The first one the
@@ -67,6 +74,8 @@ export function useDictation(
const { t } = useTranslation();
const [status, setStatus] = useState<DictationStatus>("idle");
const [audioLevel, setAudioLevel] = useState(0);
// Last error message shown to the user; the mic button reads it for its tooltip.
const [errorMessage, setErrorMessage] = useState<string | null>(null);
// Keep the latest callbacks in a ref so the recorder's onstop closure always
// calls the current handlers without re-creating the recorder.
@@ -194,15 +203,16 @@ export function useDictation(
if (startingRef.current || recorderRef.current || streamRef.current) return;
if (status !== "idle") return;
startingRef.current = true;
// Clear any stale error from a previous attempt.
setErrorMessage(null);
if (!navigator.mediaDevices?.getUserMedia) {
const reason =
"navigator.mediaDevices.getUserMedia is unavailable in this context";
console.error("[dictation] " + reason);
notifications.show({
color: "red",
message: t("Audio recording is not available in this browser/context"),
});
const message = dictationErrorMessage("no-media-devices", t);
notifications.show({ color: "red", message });
setErrorMessage(message);
setStatus("idle");
startingRef.current = false;
return;
@@ -215,19 +225,16 @@ export function useDictation(
// Always log the full error for diagnosis (name, message, stack).
console.error("[dictation] getUserMedia failed", err);
const name = (err as { name?: string })?.name;
const detail = (err as { message?: string })?.message ?? String(err);
let message: string;
if (name === "NotAllowedError" || name === "SecurityError") {
message = t("Microphone access denied");
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
message = t("No microphone found");
} else if (name === "NotReadableError" || name === "AbortError") {
message = t("Microphone is unavailable or already in use");
} else {
// Unknown failure: show the real reason instead of a generic string.
message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
}
const rawDetail = (err as { message?: string })?.message ?? String(err);
// Prefix the DOMException name (e.g. "TypeError: …") so the generic
// resolver branch reproduces this hook's original "Could not start
// recording: <name>: <detail>" text. Each caller owns its own detail; the
// streaming hook intentionally does not add the name.
const detail = `${name ? `${name}: ` : ""}${rawDetail}`;
const code = classifyGetUserMediaError(err);
const message = dictationErrorMessage(code, t, { detail });
notifications.show({ color: "red", message });
setErrorMessage(message);
setStatus("idle");
startingRef.current = false;
return;
@@ -249,10 +256,10 @@ export function useDictation(
// The stream was acquired but the recorder failed to construct; stop the
// tracks so the MediaStream does not leak before bailing out.
stopTracks();
notifications.show({
color: "red",
message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
});
const detail = (err as { message?: string })?.message ?? String(err);
const message = dictationErrorMessage("recorder-failed", t, { detail });
notifications.show({ color: "red", message });
setErrorMessage(message);
setStatus("idle");
startingRef.current = false;
return;
@@ -293,21 +300,14 @@ export function useDictation(
.catch((err: unknown) => {
// Log the full error for diagnosis (status + body + stack).
console.error("[dictation] transcription failed", err);
const resp = (
err as { response?: { status?: number; data?: { message?: string } } }
)?.response;
const serverMsg = resp?.data?.message;
let message: string;
if (serverMsg && serverMsg.trim().length > 0) {
// The server already explains the cause (e.g. provider 404, bad
// format, STT not configured) — show it verbatim.
message = serverMsg;
} else if (resp?.status === 503 || resp?.status === 403) {
message = t("Voice dictation is not configured");
} else {
message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
}
const { code, serverMessage } = classifyTranscriptionError(err);
const detail = (err as { message?: string })?.message ?? String(err);
const message = dictationErrorMessage(code, t, {
serverMessage,
detail,
});
notifications.show({ color: "red", message });
setErrorMessage(message);
setStatus("error");
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
@@ -332,10 +332,10 @@ export function useDictation(
stopTracks();
recorderRef.current = null;
startingRef.current = false;
notifications.show({
color: "red",
message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
});
const detail = (err as { message?: string })?.message ?? String(err);
const message = dictationErrorMessage("recorder-failed", t, { detail });
notifications.show({ color: "red", message });
setErrorMessage(message);
setStatus("idle");
return;
}
@@ -405,5 +405,5 @@ export function useDictation(
};
}, [clearTimer, stopTracks, stopMeter]);
return { status, start, stop, cancel, audioLevel };
return { status, start, stop, cancel, audioLevel, errorMessage };
}
@@ -4,6 +4,11 @@ import { useTranslation } from "react-i18next";
import { transcribeAudio } from "@/features/dictation/services/dictation-service";
import { encodeWavPcm16 } from "@/features/dictation/utils/encode-wav";
import type { DictationStatus } from "@/features/dictation/hooks/use-dictation";
import {
classifyGetUserMediaError,
classifyTranscriptionError,
dictationErrorMessage,
} from "@/features/dictation/dictation-status";
// Lazily-imported MicVAD type. The runtime import happens inside start() so the
// heavy onnxruntime-web / Silero model is code-split out of the main bundle and
@@ -27,6 +32,8 @@ interface UseStreamingDictationResult {
cancel: () => void;
// Smoothed live speech level in the 0..1 range while recording (0 when idle).
audioLevel: number;
// The last error shown to the user (null until one occurs / on a new start).
errorMessage: string | null;
}
// Sample rate of the audio MicVAD hands to onSpeechEnd (Silero VAD runs at 16k).
@@ -60,6 +67,8 @@ export function useStreamingDictation(
const { t } = useTranslation();
const [status, setStatus] = useState<DictationStatus>("idle");
const [audioLevel, setAudioLevel] = useState(0);
// Last error message shown to the user; the mic button reads it for its tooltip.
const [errorMessage, setErrorMessage] = useState<string | null>(null);
// Keep the latest callbacks in a ref so async VAD/HTTP closures always call the
// current handlers without re-creating the VAD.
@@ -158,26 +167,6 @@ export function useStreamingDictation(
}
}, []);
// Map a transcription error to a user-facing message, mirroring the batch hook.
const transcriptionErrorMessage = useCallback(
(err: unknown): string => {
const resp = (
err as { response?: { status?: number; data?: { message?: string } } }
)?.response;
const serverMsg = resp?.data?.message;
if (serverMsg && serverMsg.trim().length > 0) {
// The server already explains the cause (e.g. provider 404, bad format,
// STT not configured) — show it verbatim.
return serverMsg;
}
if (resp?.status === 503 || resp?.status === 403) {
return t("Voice dictation is not configured");
}
return `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
},
[t],
);
// Handle one ended speech segment: encode to WAV and transcribe. Results are
// buffered by seq and flushed in order. A single failed segment does NOT kill
// the session: log + one notification, then advance past that seq so later
@@ -204,10 +193,14 @@ export function useStreamingDictation(
if (epoch !== epochRef.current) return;
// Log the full error for diagnosis (status + body + stack).
console.error("[dictation] segment transcription failed", err);
notifications.show({
color: "red",
message: transcriptionErrorMessage(err),
const { code, serverMessage } = classifyTranscriptionError(err);
const detail = (err as { message?: string })?.message ?? String(err);
const message = dictationErrorMessage(code, t, {
serverMessage,
detail,
});
notifications.show({ color: "red", message });
setErrorMessage(message);
// Skip this seq so later segments can still flush in order.
if (nextEmitSeqRef.current === seq) {
nextEmitSeqRef.current += 1;
@@ -226,7 +219,7 @@ export function useStreamingDictation(
}
});
},
[drainResults, transcriptionErrorMessage],
[drainResults, t],
);
const start = useCallback(async (): Promise<void> => {
@@ -236,6 +229,8 @@ export function useStreamingDictation(
if (startingRef.current || vadRef.current || activeRef.current) return;
if (status !== "idle") return;
startingRef.current = true;
// Clear any stale error from a previous attempt.
setErrorMessage(null);
// Notify the caller right when dictation begins (before any async work) so the
// editor can snapshot the caret position.
@@ -354,10 +349,9 @@ export function useStreamingDictation(
// actually runs.)
console.error("[dictation] VAD init failed", err);
const detail = (err as { message?: string })?.message ?? String(err);
notifications.show({
color: "red",
message: `${t("Could not start recording")}: ${detail}`,
});
const message = dictationErrorMessage("vad-init-failed", t, { detail });
notifications.show({ color: "red", message });
setErrorMessage(message);
// Defensive: if MicVAD.new partially succeeded before throwing, make sure we
// don't leak it.
destroyVad();
@@ -379,19 +373,11 @@ export function useStreamingDictation(
} catch (err) {
// Always log the full error for diagnosis (name, message, stack).
console.error("[dictation] VAD.start failed", err);
const name = (err as { name?: string })?.name;
const detail = (err as { message?: string })?.message ?? String(err);
let message: string;
if (name === "NotAllowedError" || name === "SecurityError") {
message = t("Microphone access denied");
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
message = t("No microphone found");
} else if (name === "NotReadableError" || name === "AbortError") {
message = t("Microphone is unavailable or already in use");
} else {
message = `${t("Could not start recording")}: ${detail}`;
}
const code = classifyGetUserMediaError(err);
const message = dictationErrorMessage(code, t, { detail });
notifications.show({ color: "red", message });
setErrorMessage(message);
activeRef.current = false;
destroyVad();
setStatus("idle");
@@ -470,5 +456,5 @@ export function useStreamingDictation(
};
}, [clearTimer, destroyVad]);
return { status, start, stop, cancel, audioLevel };
return { status, start, stop, cancel, audioLevel, errorMessage };
}
@@ -1,6 +1,7 @@
import { atom } from "jotai";
import { Editor } from "@tiptap/core";
import { PageEditMode } from "@/features/user/types/user.types.ts";
import type { DictationUnavailableReason } from "@/features/dictation/dictation-status";
export const pageEditorAtom = atom<Editor | null>(null);
@@ -15,3 +16,15 @@ export const showLinkMenuAtom = atom(false);
// Current page's edit mode — initialized from the user's saved preference on
// first load, can be toggled locally without persisting to the server.
export const currentPageEditModeAtom = atom<PageEditMode>(PageEditMode.Edit);
// Whether the dictation mic can start, and (when it can't) the cause-specific
// reason the mic button surfaces as a tooltip. Published by the page editor,
// consumed by DictationGroup -> MicButton.
export type DictationAvailability = {
isEditable: boolean;
reason: DictationUnavailableReason | null;
};
export const dictationAvailabilityAtom = atom<DictationAvailability>({
isEditable: false,
reason: null,
});
@@ -1,36 +1,15 @@
import { describe, it, expect, vi } from "vitest";
import { useEffect, useState } from "react";
import { render, act } from "@testing-library/react";
import { Provider, createStore } from "jotai";
import { dictationAvailabilityAtom } from "@/features/editor/atoms/editor-atoms.ts";
// Regression test for #311: on a page the user can edit, the byline mic stayed
// stuck disabled until an unrelated re-render happened, because DictationGroup
// read the non-reactive field `editor.isEditable` directly. The fix reads it via
// `useEditorState`, which subscribes to the editor's own events.
//
// The mock below mirrors the real `useEditorState` contract: it runs the
// selector, and re-runs it (re-rendering the consumer) whenever the editor emits
// an event. This is what makes the test faithful — with the pre-fix code
// (`disabled={!editor.isEditable}`) DictationGroup never subscribes, so emitting
// an event would NOT re-render and the mic would stay disabled.
vi.mock("@tiptap/react", () => ({
useEditorState: ({ editor, selector }: any) => {
const [value, setValue] = useState(() => selector({ editor }));
useEffect(() => {
const handler = () => setValue(selector({ editor }));
editor.on("update", handler);
return () => editor.off("update", handler);
}, [editor, selector]);
return value;
},
}));
// The mic only cares about the workspace's streaming flag; return a stable stub.
vi.mock("jotai", () => ({
useAtomValue: () => ({ settings: { ai: { dictationStreaming: false } } }),
}));
vi.mock("@/features/user/atoms/current-user-atom.ts", () => ({
workspaceAtom: {},
}));
// Regression test for the byline mic staying stuck disabled (#311 / #309): on a
// page the user can edit, the mic must un-grey once the body becomes editable.
// #311 first fixed this by reading `editor.isEditable` via `useEditorState`; #309
// superseded that with a reactive `dictationAvailabilityAtom` that page-editor
// publishes (carrying both the editable gate AND the unavailable reason). The mic
// now gates on `dictationAvailability.isEditable`, so a change to that atom must
// re-render the group and flip the disabled state (jotai drives the subscription).
// Detectable stand-in that surfaces the `disabled` prop the component computes.
vi.mock("@/features/dictation/components/mic-button", () => ({
@@ -41,33 +20,39 @@ vi.mock("@/features/dictation/components/mic-button", () => ({
import { DictationGroup } from "./dictation-group";
// Minimal editor stand-in: a mutable `isEditable` field plus a tiny event
// emitter, matching the surface DictationGroup + the mocked useEditorState use.
function makeFakeEditor(isEditable: boolean) {
const listeners = new Set<() => void>();
// Minimal editor stand-in matching the surface DictationGroup uses (handleStart /
// handleText). The disabled gate no longer reads this — it reads the atom.
function makeFakeEditor() {
return {
isEditable,
isEditable: false,
isDestroyed: false,
state: { selection: { from: 0, to: 0 }, doc: { content: { size: 0 } } },
on: (_event: string, cb: () => void) => listeners.add(cb),
off: (_event: string, cb: () => void) => listeners.delete(cb),
emit: () => listeners.forEach((cb) => cb()),
} as any;
}
describe("DictationGroup editable reactivity (#311)", () => {
it("re-enables the mic when the editor flips isEditable false -> true", () => {
const editor = makeFakeEditor(false);
const { getByTestId } = render(<DictationGroup editor={editor} />);
describe("DictationGroup editable reactivity (#309 atom / #311)", () => {
it("re-enables the mic when dictationAvailability flips isEditable false -> true", () => {
const editor = makeFakeEditor();
const store = createStore();
// Pre-sync: page editor publishes not-editable (with a reason).
store.set(dictationAvailabilityAtom, {
isEditable: false,
reason: "connecting",
});
// Pre-sync: not editable yet, so the mic is disabled (preserves #218 intent).
const { getByTestId } = render(
<Provider store={store}>
<DictationGroup editor={editor} />
</Provider>,
);
// Not editable yet -> disabled (preserves the #218 pre-sync intent).
expect(getByTestId("mic").hasAttribute("disabled")).toBe(true);
// Collab sync flips the editor editable via editor.setEditable(true), which
// mutates the field and emits — the mic must react and enable itself.
// Collab sync -> page editor republishes editable; the atom change must
// re-render the group and enable the mic.
act(() => {
editor.isEditable = true;
editor.emit();
store.set(dictationAvailabilityAtom, { isEditable: true, reason: null });
});
expect(getByTestId("mic").hasAttribute("disabled")).toBe(false);
@@ -1,7 +1,8 @@
import { FC, useRef } from "react";
import { Editor, useEditorState } from "@tiptap/react";
import { Editor } from "@tiptap/react";
import { useAtomValue } from "jotai";
import { workspaceAtom } from "@/features/user/atoms/current-user-atom.ts";
import { dictationAvailabilityAtom } from "@/features/editor/atoms/editor-atoms.ts";
import { MicButton } from "@/features/dictation/components/mic-button";
interface Props {
@@ -16,20 +17,14 @@ export const DictationGroup: FC<Props> = ({ editor, color, iconSize }) => {
const workspace = useAtomValue(workspaceAtom);
const streamingDictation =
workspace?.settings?.ai?.dictationStreaming === true;
// Cause-specific reason the mic is unavailable (published by the page editor).
const dictationAvailability = useAtomValue(dictationAvailabilityAtom);
// Caret snapshot taken when dictation starts (where the first segment lands).
const rangeRef = useRef<{ from: number; to: number } | null>(null);
// Running insertion point: after each inserted segment we remember the caret
// end so the NEXT segment appends right after it, contiguously, regardless of
// where the user's caret currently is. Null until the first segment lands.
const insertPosRef = useRef<number | null>(null);
// editor.isEditable is a mutable, non-reactive field — read it via
// useEditorState so the mic re-enables when the body flips to editable after
// collab sync (otherwise it stays stuck disabled). Mirrors the body's own
// reactive read.
const isEditable = useEditorState({
editor,
selector: (ctx) => ctx.editor?.isEditable ?? false,
});
const handleStart = () => {
const { from, to } = editor.state.selection;
@@ -88,7 +83,8 @@ export const DictationGroup: FC<Props> = ({ editor, color, iconSize }) => {
streaming={streamingDictation}
onStart={handleStart}
onText={handleText}
disabled={!isEditable}
disabled={!dictationAvailability.isEditable}
unavailableReason={dictationAvailability.reason ?? undefined}
color={color}
iconSize={iconSize}
/>
@@ -1,6 +1,10 @@
import { describe, it, expect } from "vitest";
import { WebSocketStatus } from "@hocuspocus/provider";
import { isCollabSynced, isBodyEditable } from "./editor-sync-state";
import {
isCollabSynced,
isBodyEditable,
computeDictationAvailability,
} from "./editor-sync-state";
describe("isCollabSynced", () => {
it("is true only when Connected and synced", () => {
@@ -30,3 +34,77 @@ describe("isBodyEditable (pre-sync data-loss gate, #218)", () => {
expect(isBodyEditable({ ...base, inEditMode: false })).toBe(false);
});
});
describe("computeDictationAvailability (mic reason precedence, #309)", () => {
const base = {
editable: true,
inEditMode: true,
showStatic: false,
isDisconnected: false,
};
it("is available with no reason once synced (showStatic false)", () => {
expect(computeDictationAvailability(base)).toEqual({
isEditable: true,
reason: null,
});
});
it("reports 'offline' during pre-sync while disconnected", () => {
expect(
computeDictationAvailability({
...base,
showStatic: true,
isDisconnected: true,
}),
).toEqual({ isEditable: false, reason: "offline" });
});
it("reports 'connecting' during pre-sync while still connecting", () => {
expect(
computeDictationAvailability({
...base,
showStatic: true,
isDisconnected: false,
}),
).toEqual({ isEditable: false, reason: "connecting" });
});
it("reports 'read-only' without edit permission", () => {
expect(
computeDictationAvailability({ ...base, editable: false }),
).toEqual({ isEditable: false, reason: "read-only" });
});
it("reports 'read-only' when not in edit mode", () => {
expect(
computeDictationAvailability({ ...base, inEditMode: false }),
).toEqual({ isEditable: false, reason: "read-only" });
});
// Lack of edit permission takes precedence over the pre-sync reason: a
// read-only viewer who is ALSO inside the pre-sync window (showStatic) must
// still read "read-only", never "offline"/"connecting". This pins the
// `opts.editable &&` guard on the pre-sync branch.
it("prefers 'read-only' over pre-sync when a read-only viewer is disconnected", () => {
expect(
computeDictationAvailability({
editable: false,
inEditMode: true,
showStatic: true,
isDisconnected: true,
}),
).toEqual({ isEditable: false, reason: "read-only" });
});
it("prefers 'read-only' over pre-sync when a read-only viewer is still connecting", () => {
expect(
computeDictationAvailability({
editable: false,
inEditMode: true,
showStatic: true,
isDisconnected: false,
}),
).toEqual({ isEditable: false, reason: "read-only" });
});
});
@@ -1,4 +1,5 @@
import { WebSocketStatus } from "@hocuspocus/provider";
import type { DictationUnavailableReason } from "@/features/dictation/dictation-status";
/**
* The collab document is usable only once the provider is Connected AND has
@@ -30,3 +31,32 @@ export function isBodyEditable(opts: {
}): boolean {
return opts.editable && opts.inEditMode && !opts.showStatic;
}
/**
* Whether dictation can start and, when it can't, the cause-specific reason the
* mic button surfaces. Derives editability from `isBodyEditable` (the single,
* tested gate) so the published `isEditable` can never diverge from the actual
* body-editable state and make the tooltip lie (#309).
*
* `isDisconnected` is the caller's own boolean (collab connection is in the
* Disconnected state), passed in so this module stays free of the collab enum.
*/
export function computeDictationAvailability(opts: {
editable: boolean;
inEditMode: boolean;
showStatic: boolean;
isDisconnected: boolean;
}): { isEditable: boolean; reason: DictationUnavailableReason | null } {
const isEditable = isBodyEditable({
editable: opts.editable,
inEditMode: opts.inEditMode,
showStatic: opts.showStatic,
});
if (isEditable) return { isEditable, reason: null };
// Permitted to edit and in edit mode but not yet synced (showStatic) → pre-sync.
if (opts.editable && opts.inEditMode && opts.showStatic) {
return { isEditable, reason: opts.isDisconnected ? "offline" : "connecting" };
}
// No edit permission or not in edit mode.
return { isEditable, reason: "read-only" };
}
@@ -27,11 +27,12 @@ import {
collabExtensions,
mainExtensions,
} from "@/features/editor/extensions/extensions";
import { useAtom, useAtomValue } from "jotai";
import { useAtom, useAtomValue, useSetAtom } from "jotai";
import useCollaborationUrl from "@/features/editor/hooks/use-collaboration-url";
import { currentUserAtom } from "@/features/user/atoms/current-user-atom";
import {
currentPageEditModeAtom,
dictationAvailabilityAtom,
pageEditorAtom,
yjsConnectionStatusAtom,
} from "@/features/editor/atoms/editor-atoms";
@@ -88,6 +89,7 @@ import { PageEmbedAncestryProvider } from "@/features/editor/components/page-emb
import PageEmbedPicker from "@/features/editor/components/page-embed/page-embed-picker";
import { useTranslation } from "react-i18next";
import {
computeDictationAvailability,
isBodyEditable,
isCollabSynced,
} from "@/features/editor/editor-sync-state";
@@ -139,6 +141,7 @@ export default function PageEditor({
const { pageSlug } = useParams();
const slugId = extractPageSlugId(pageSlug);
const currentPageEditMode = useAtomValue(currentPageEditModeAtom);
const setDictationAvailability = useSetAtom(dictationAvailabilityAtom);
const canScroll = useCallback(
() => Boolean(isComponentMounted.current && editorRef.current),
[isComponentMounted],
@@ -488,6 +491,26 @@ export default function PageEditor({
);
}, [currentPageEditMode, editor, editable, showStatic]);
// Publish whether dictation can start and, if not, the cause-specific reason
// the mic button surfaces. Recomputed on the same signals that drive body
// editability so the tooltip never lies about the current state.
useEffect(() => {
setDictationAvailability(
computeDictationAvailability({
editable,
inEditMode: currentPageEditMode === PageEditMode.Edit,
showStatic,
isDisconnected: yjsConnectionStatus === WebSocketStatus.Disconnected,
}),
);
}, [
editable,
currentPageEditMode,
showStatic,
yjsConnectionStatus,
setDictationAvailability,
]);
useEffect(() => {
if (
!hasConnectedOnceRef.current &&