Merge pull request 'feat(ai-chat): realtime token counter + reasoning tokens (#151)' (#158) from feat/ai-chat-realtime-tokens into develop
Reviewed-on: #158
This commit was merged in pull request #158.
This commit is contained in:
@@ -156,6 +156,12 @@ export default function AiChatWindow() {
|
||||
isStreaming: false,
|
||||
});
|
||||
|
||||
// Live turn-token total (reasoning + output) for the in-flight turn, pushed up
|
||||
// (THROTTLED to ~8 Hz inside ChatThread) so the header badge ticks mid-stream.
|
||||
// `null` means no turn is in flight -> the badge falls back to the persisted
|
||||
// context size below.
|
||||
const [liveTurnTokens, setLiveTurnTokens] = useState<number | null>(null);
|
||||
|
||||
// The page the user is currently viewing. AiChatWindow lives in a pathless
|
||||
// parent layout route, so useParams() can't see :pageSlug. Match the full
|
||||
// pathname against the authenticated page route instead so "the current page"
|
||||
@@ -485,11 +491,19 @@ export default function AiChatWindow() {
|
||||
)}
|
||||
|
||||
<div style={{ flex: 1, display: "flex", justifyContent: "center" }}>
|
||||
{contextTokens > 0 && (
|
||||
{/* While a turn streams, show the LIVE turn-token count (ticks ~8 Hz);
|
||||
once it finishes, fall back to the persisted context size. Require
|
||||
> 0 so the very first emit (an empty tail message, count 0) does not
|
||||
flash a "0" badge before any token streams in (#151 review). */}
|
||||
{liveTurnTokens !== null && liveTurnTokens > 0 ? (
|
||||
<Tooltip label={t("Tokens generated this turn")} withArrow>
|
||||
<span className={classes.badge}>{formatTokens(liveTurnTokens)}</span>
|
||||
</Tooltip>
|
||||
) : contextTokens > 0 ? (
|
||||
<Tooltip label={t("Current context size")} withArrow>
|
||||
<span className={classes.badge}>{formatTokens(contextTokens)}</span>
|
||||
</Tooltip>
|
||||
)}
|
||||
) : null}
|
||||
</div>
|
||||
|
||||
<div style={{ display: "flex", alignItems: "center", gap: 1 }}>
|
||||
@@ -608,6 +622,7 @@ export default function AiChatWindow() {
|
||||
assistantName={currentRole?.name}
|
||||
onTurnFinished={onTurnFinished}
|
||||
liveStateRef={liveThreadRef}
|
||||
onLiveTurnTokens={setLiveTurnTokens}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@@ -111,6 +111,24 @@
|
||||
background: light-dark(var(--mantine-color-gray-0), var(--mantine-color-dark-6));
|
||||
}
|
||||
|
||||
/* Collapsible "Thinking" (reasoning) block: a subtle left rule, dimmer than the
|
||||
answer so it reads as secondary thinking context above the real answer. */
|
||||
.reasoningBlock {
|
||||
border-left: 2px solid light-dark(var(--mantine-color-gray-3), var(--mantine-color-dark-4));
|
||||
padding-left: 8px;
|
||||
}
|
||||
|
||||
.reasoningText {
|
||||
margin-top: 4px;
|
||||
font-size: var(--mantine-font-size-xs);
|
||||
color: light-dark(var(--mantine-color-gray-7), var(--mantine-color-dark-1));
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
.reasoningText p {
|
||||
margin: 0 0 4px;
|
||||
}
|
||||
|
||||
.inputWrapper {
|
||||
flex: 0 0 auto;
|
||||
padding-top: var(--mantine-spacing-xs);
|
||||
|
||||
@@ -27,6 +27,7 @@ import {
|
||||
} from "@/features/ai-chat/utils/role-launch.ts";
|
||||
import { describeChatError } from "@/features/ai-chat/utils/error-message.ts";
|
||||
import { extractServerChatId } from "@/features/ai-chat/utils/adopt-chat-id.ts";
|
||||
import { liveTurnTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
|
||||
import {
|
||||
dequeue,
|
||||
enqueueMessage,
|
||||
@@ -73,6 +74,12 @@ interface ChatThreadProps {
|
||||
* assistant message. A ref (not state) avoids re-rendering the parent on
|
||||
* every streamed delta. */
|
||||
liveStateRef?: MutableRefObject<{ messages: UIMessage[]; isStreaming: boolean }>;
|
||||
/** Reports the live turn-token total (reasoning + output) for the in-flight
|
||||
* turn so the parent can show a header badge that ticks mid-stream. THROTTLED
|
||||
* here (~8 Hz) so the parent re-renders a handful of times a second, not on
|
||||
* every streamed delta. Called with `null` when no turn is in flight (the
|
||||
* parent then reverts the badge to the persisted context size). */
|
||||
onLiveTurnTokens?: (tokens: number | null) => void;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -117,6 +124,7 @@ export default function ChatThread({
|
||||
assistantName,
|
||||
onTurnFinished,
|
||||
liveStateRef,
|
||||
onLiveTurnTokens,
|
||||
}: ChatThreadProps) {
|
||||
const { t } = useTranslation();
|
||||
|
||||
@@ -314,6 +322,54 @@ export default function ChatThread({
|
||||
};
|
||||
}, [liveStateRef, messages, isStreaming]);
|
||||
|
||||
// Report the live turn-token total to the parent header badge, THROTTLED to
|
||||
// ~8 Hz so the parent re-renders a few times a second instead of on every
|
||||
// streamed delta. The tail assistant message's reasoning+output (estimate while
|
||||
// streaming, authoritative once a step reports usage) is the live figure. When
|
||||
// the turn ends we emit a final exact value, then `null` so the parent reverts
|
||||
// the badge to the persisted context size.
|
||||
const lastEmitRef = useRef(0);
|
||||
const emitTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
||||
useEffect(() => {
|
||||
if (!onLiveTurnTokens) return;
|
||||
if (!isStreaming) {
|
||||
// Turn ended (or never started): clear any pending throttle and revert.
|
||||
if (emitTimerRef.current) {
|
||||
clearTimeout(emitTimerRef.current);
|
||||
emitTimerRef.current = null;
|
||||
}
|
||||
lastEmitRef.current = 0;
|
||||
onLiveTurnTokens(null);
|
||||
return;
|
||||
}
|
||||
const tail = messages[messages.length - 1];
|
||||
const live =
|
||||
tail?.role === "assistant" ? liveTurnTokens(tail) : null;
|
||||
const total = live ? live.reasoning + live.output : 0;
|
||||
const now = Date.now();
|
||||
const MIN_INTERVAL = 120; // ms (~8 Hz)
|
||||
const elapsed = now - lastEmitRef.current;
|
||||
if (elapsed >= MIN_INTERVAL) {
|
||||
lastEmitRef.current = now;
|
||||
onLiveTurnTokens(total);
|
||||
} else if (!emitTimerRef.current) {
|
||||
// Schedule a trailing emit so the FINAL value of a burst is not dropped.
|
||||
emitTimerRef.current = setTimeout(() => {
|
||||
emitTimerRef.current = null;
|
||||
lastEmitRef.current = Date.now();
|
||||
onLiveTurnTokens(total);
|
||||
}, MIN_INTERVAL - elapsed);
|
||||
}
|
||||
}, [messages, isStreaming, onLiveTurnTokens]);
|
||||
|
||||
// Clear any pending throttle timer on unmount (chat switch via `key`) so a
|
||||
// trailing emit can't fire into a torn-down thread's parent.
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
if (emitTimerRef.current) clearTimeout(emitTimerRef.current);
|
||||
};
|
||||
}, []);
|
||||
|
||||
// Classify the turn error into a heading + detail so the banner names the cause
|
||||
// (connection reset, timeout, rate limit, context overflow, quota, ...) instead
|
||||
// of a generic "Something went wrong".
|
||||
|
||||
@@ -2,12 +2,14 @@ import { Box, Text } from "@mantine/core";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import type { UIMessage } from "@ai-sdk/react";
|
||||
import ToolCallCard from "@/features/ai-chat/components/tool-call-card.tsx";
|
||||
import ReasoningBlock from "@/features/ai-chat/components/reasoning-block.tsx";
|
||||
import ChatErrorAlert from "@/features/ai-chat/components/chat-error-alert.tsx";
|
||||
import ChatStoppedNotice from "@/features/ai-chat/components/chat-stopped-notice.tsx";
|
||||
import { ToolUiPart, isToolPart } from "@/features/ai-chat/utils/tool-parts.tsx";
|
||||
import { assistantMessageHasVisibleContent } from "@/features/ai-chat/utils/message-content.ts";
|
||||
import { renderChatMarkdown } from "@/features/ai-chat/utils/markdown.ts";
|
||||
import { resolveAssistantName } from "@/features/ai-chat/utils/assistant-name.ts";
|
||||
import { reasoningTokensForPart } from "@/features/ai-chat/utils/reasoning-tokens.ts";
|
||||
import { describeChatError } from "@/features/ai-chat/utils/error-message.ts";
|
||||
import classes from "@/features/ai-chat/components/ai-chat.module.css";
|
||||
|
||||
@@ -77,12 +79,31 @@ export default function MessageItem({
|
||||
// return won't fire for them.
|
||||
if (!assistantMessageHasVisibleContent(message)) return null;
|
||||
|
||||
// Authoritative reasoning token count to attribute to a reasoning block, or
|
||||
// undefined when the block must estimate on its own. See reasoningTokensForPart
|
||||
// for the #151 anti-double-count rule (only a single reasoning part may carry
|
||||
// the turn total). The authoritative turn total is still surfaced live in the
|
||||
// header badge regardless.
|
||||
const reasoningTokens = reasoningTokensForPart(message);
|
||||
|
||||
return (
|
||||
<Box className={classes.messageRow}>
|
||||
<Text size="xs" c="dimmed" mb={4}>
|
||||
{resolveAssistantName(assistantName) ?? t("AI agent")}
|
||||
</Text>
|
||||
{message.parts.map((part, index) => {
|
||||
if (part.type === "reasoning") {
|
||||
// Reasoning ("thinking") -> a collapsible block with its own token
|
||||
// count. Empty/whitespace reasoning with no authoritative count carries
|
||||
// nothing to show, so skip it (avoids an empty 0-token block).
|
||||
const text = (part as { text?: string }).text ?? "";
|
||||
if (!text.trim() && !(reasoningTokens && reasoningTokens > 0))
|
||||
return null;
|
||||
return (
|
||||
<ReasoningBlock key={index} text={text} tokens={reasoningTokens} />
|
||||
);
|
||||
}
|
||||
|
||||
if (part.type === "text") {
|
||||
// Skip empty/whitespace-only text parts (a streaming message often
|
||||
// starts with an empty text part before the first token arrives); the
|
||||
|
||||
@@ -6,6 +6,7 @@ import MessageItem from "@/features/ai-chat/components/message-item.tsx";
|
||||
import TypingIndicator from "@/features/ai-chat/components/typing-indicator.tsx";
|
||||
import { isToolPart, toolRunState, ToolUiPart } from "@/features/ai-chat/utils/tool-parts.tsx";
|
||||
import { assistantMessageHasVisibleContent } from "@/features/ai-chat/utils/message-content.ts";
|
||||
import { liveTurnTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
|
||||
import classes from "@/features/ai-chat/components/ai-chat.module.css";
|
||||
|
||||
interface MessageListProps {
|
||||
@@ -94,6 +95,19 @@ export function typingIndicatorShowsName(messages: UIMessage[]): boolean {
|
||||
return !assistantMessageHasVisibleContent(last);
|
||||
}
|
||||
|
||||
/**
|
||||
* The live thinking-token count to show on the standalone typing indicator. It
|
||||
* is the reasoning split of the tail assistant message (estimate while streaming,
|
||||
* authoritative once the server attaches usage at a step/turn boundary). Returns
|
||||
* 0 when the turn has produced no reasoning yet — the indicator then shows the
|
||||
* plain "Thinking…" line.
|
||||
*/
|
||||
export function tailThinkingTokens(messages: UIMessage[]): number {
|
||||
const last = messages[messages.length - 1];
|
||||
if (!last || last.role !== "assistant") return 0;
|
||||
return liveTurnTokens(last).reasoning;
|
||||
}
|
||||
|
||||
/**
|
||||
* Scrollable transcript. Auto-scrolls to the newest message as it streams in,
|
||||
* but only while the user is pinned to the bottom — if they scrolled up to read
|
||||
@@ -190,7 +204,13 @@ export default function MessageList({
|
||||
assistantName={assistantName}
|
||||
/>
|
||||
))}
|
||||
{typing && <TypingIndicator assistantName={assistantName} showName={typingIndicatorShowsName(messages)} />}
|
||||
{typing && (
|
||||
<TypingIndicator
|
||||
assistantName={assistantName}
|
||||
showName={typingIndicatorShowsName(messages)}
|
||||
thinkingTokens={tailThinkingTokens(messages)}
|
||||
/>
|
||||
)}
|
||||
</Stack>
|
||||
</ScrollArea>
|
||||
);
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
import { describe, it, expect, vi } from "vitest";
|
||||
import { render, screen } from "@testing-library/react";
|
||||
import { MantineProvider } from "@mantine/core";
|
||||
|
||||
// Stub react-i18next so `t` returns the key with `{{count}}` interpolated. This
|
||||
// keeps the assertions on the component's OWN count logic (authoritative vs
|
||||
// estimate) rather than on translation, and mirrors the t-mock pattern used by
|
||||
// other component tests in the repo.
|
||||
vi.mock("react-i18next", () => ({
|
||||
useTranslation: () => ({
|
||||
t: (key: string, opts?: { count?: number }) =>
|
||||
opts && typeof opts.count === "number"
|
||||
? key.replace("{{count}}", String(opts.count))
|
||||
: key,
|
||||
}),
|
||||
}));
|
||||
|
||||
import ReasoningBlock from "./reasoning-block";
|
||||
import { estimateTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
|
||||
|
||||
// matchMedia (read by MantineProvider) is stubbed globally in vitest.setup.ts.
|
||||
|
||||
function renderBlock(props: { text: string; tokens?: number }) {
|
||||
return render(
|
||||
<MantineProvider>
|
||||
<ReasoningBlock {...props} />
|
||||
</MantineProvider>,
|
||||
);
|
||||
}
|
||||
|
||||
describe("ReasoningBlock", () => {
|
||||
it("shows the authoritative count in the header when tokens > 0", () => {
|
||||
// Text "thinking…" estimates to ceil(9/4) = 3, but the authoritative 42
|
||||
// must win, so the header shows 42 (and NOT the 3-token estimate).
|
||||
renderBlock({ text: "thinking…", tokens: 42 });
|
||||
expect(screen.getByText("Thinking · 42 tokens")).toBeDefined();
|
||||
expect(screen.queryByText("Thinking · 3 tokens")).toBeNull();
|
||||
});
|
||||
|
||||
it("falls back to the text-length estimate when no authoritative tokens", () => {
|
||||
const text = "some reasoning prose that streams in";
|
||||
const estimate = estimateTokens(text);
|
||||
renderBlock({ text });
|
||||
expect(estimate).toBeGreaterThan(0);
|
||||
expect(screen.getByText(new RegExp(`${estimate} tokens`))).toBeDefined();
|
||||
});
|
||||
|
||||
it("header-only when text is empty but an authoritative count is present", () => {
|
||||
renderBlock({ text: "", tokens: 17 });
|
||||
expect(screen.getByText(/17 tokens/)).toBeDefined();
|
||||
// No disclosure body to expand: the toggle button is disabled.
|
||||
const button = screen.getByRole("button");
|
||||
expect((button as HTMLButtonElement).disabled).toBe(true);
|
||||
});
|
||||
|
||||
it("renders the reasoning body (markdown or raw-text fallback)", () => {
|
||||
renderBlock({ text: "**bold** reasoning", tokens: 5 });
|
||||
// The toggle is enabled because there IS body text to expand.
|
||||
const button = screen.getByRole("button");
|
||||
expect((button as HTMLButtonElement).disabled).toBe(false);
|
||||
// The body prose renders (markdown -> sanitized html, or raw-text fallback);
|
||||
// either way the text is present in the document.
|
||||
expect(screen.getByText(/reasoning/)).toBeDefined();
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,83 @@
|
||||
import { useState } from "react";
|
||||
import { Box, Collapse, Group, Text, UnstyledButton } from "@mantine/core";
|
||||
import { IconChevronDown } from "@tabler/icons-react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { estimateTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
|
||||
import { renderChatMarkdown } from "@/features/ai-chat/utils/markdown.ts";
|
||||
import classes from "@/features/ai-chat/components/ai-chat.module.css";
|
||||
|
||||
interface ReasoningBlockProps {
|
||||
/** The streamed/persisted reasoning (thinking) text. May be empty when the
|
||||
* provider reports only a reasoning token COUNT without the text. */
|
||||
text: string;
|
||||
/** Authoritative reasoning token count from `usage.reasoningTokens`, when the
|
||||
* step/turn has finished. When absent (or 0) the count is estimated from the
|
||||
* text length so it ticks live as the reasoning streams in. */
|
||||
tokens?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Collapsible "Thinking" block for an assistant `reasoning` part. Mirrors Claude
|
||||
* Code's surfacing of the model's thinking: a header that shows the thinking
|
||||
* token count (authoritative when the step has reported usage, else a live
|
||||
* estimate from the streamed text) and an expandable body with the reasoning
|
||||
* prose. Collapsed by default so it never crowds out the answer.
|
||||
*
|
||||
* Providers that don't stream reasoning TEXT still render this block from the
|
||||
* authoritative count alone (header only, empty body) so the cost is visible.
|
||||
*/
|
||||
export default function ReasoningBlock({ text, tokens }: ReasoningBlockProps) {
|
||||
const { t } = useTranslation();
|
||||
const [open, setOpen] = useState(false);
|
||||
|
||||
// Authoritative count wins; otherwise estimate live from the streamed text.
|
||||
const count = tokens && tokens > 0 ? tokens : estimateTokens(text);
|
||||
const trimmed = text.trim();
|
||||
const html = trimmed ? renderChatMarkdown(trimmed, {}) : "";
|
||||
|
||||
return (
|
||||
<Box className={classes.reasoningBlock} mb={6}>
|
||||
<UnstyledButton
|
||||
onClick={() => setOpen((o) => !o)}
|
||||
// No body to expand when the provider reported only a token count.
|
||||
disabled={!trimmed}
|
||||
aria-expanded={open}
|
||||
>
|
||||
<Group gap={6} wrap="nowrap" align="center">
|
||||
<IconChevronDown
|
||||
size={12}
|
||||
style={{
|
||||
transform: open ? "none" : "rotate(-90deg)",
|
||||
transition: "transform 150ms ease",
|
||||
opacity: trimmed ? 1 : 0.4,
|
||||
}}
|
||||
/>
|
||||
<Text size="xs" c="dimmed">
|
||||
{count > 0
|
||||
? t("Thinking · {{count}} tokens", { count })
|
||||
: t("Thinking")}
|
||||
</Text>
|
||||
</Group>
|
||||
</UnstyledButton>
|
||||
|
||||
{trimmed && (
|
||||
<Collapse in={open}>
|
||||
{html ? (
|
||||
<div
|
||||
className={classes.reasoningText}
|
||||
// Sanitized by renderChatMarkdown (DOMPurify) before insertion.
|
||||
dangerouslySetInnerHTML={{ __html: html }}
|
||||
/>
|
||||
) : (
|
||||
<Text
|
||||
className={classes.reasoningText}
|
||||
style={{ whiteSpace: "pre-wrap" }}
|
||||
>
|
||||
{trimmed}
|
||||
</Text>
|
||||
)}
|
||||
</Collapse>
|
||||
)}
|
||||
</Box>
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { UIMessage } from "@ai-sdk/react";
|
||||
import { tailThinkingTokens } from "@/features/ai-chat/components/message-list.tsx";
|
||||
|
||||
/**
|
||||
* Pure-helper tests for `tailThinkingTokens`: the live thinking-token count the
|
||||
* standalone typing indicator shows. It is the reasoning split of the tail
|
||||
* assistant message (estimate while streaming, authoritative once usage arrives).
|
||||
*/
|
||||
const msg = (
|
||||
role: "user" | "assistant",
|
||||
parts: unknown[],
|
||||
metadata?: unknown,
|
||||
): UIMessage =>
|
||||
({ id: Math.random().toString(), role, parts, metadata }) as UIMessage;
|
||||
|
||||
describe("tailThinkingTokens", () => {
|
||||
it("is 0 when there are no messages", () => {
|
||||
expect(tailThinkingTokens([])).toBe(0);
|
||||
});
|
||||
|
||||
it("is 0 when the tail message is the user's", () => {
|
||||
expect(tailThinkingTokens([msg("user", [{ type: "text", text: "q" }])])).toBe(0);
|
||||
});
|
||||
|
||||
it("is 0 when the assistant has produced no reasoning yet", () => {
|
||||
expect(
|
||||
tailThinkingTokens([msg("assistant", [{ type: "text", text: "answer" }])]),
|
||||
).toBe(0);
|
||||
});
|
||||
|
||||
it("estimates reasoning tokens from streamed reasoning text", () => {
|
||||
// 8 chars -> 2 tokens.
|
||||
expect(
|
||||
tailThinkingTokens([
|
||||
msg("assistant", [{ type: "reasoning", text: "12345678" }]),
|
||||
]),
|
||||
).toBe(2);
|
||||
});
|
||||
|
||||
it("uses authoritative usage.reasoningTokens once the server attaches it", () => {
|
||||
expect(
|
||||
tailThinkingTokens([
|
||||
msg("assistant", [{ type: "reasoning", text: "x" }], {
|
||||
usage: { outputTokens: 100, reasoningTokens: 42 },
|
||||
}),
|
||||
]),
|
||||
).toBe(42);
|
||||
});
|
||||
});
|
||||
@@ -16,6 +16,12 @@ interface TypingIndicatorProps {
|
||||
* assistant row above already shows the same name, to avoid a duplicate label.
|
||||
*/
|
||||
showName?: boolean;
|
||||
/**
|
||||
* Live thinking/reasoning token count for the in-flight turn. When > 0 the
|
||||
* typing line becomes `Thinking… · {count} tokens` (like Claude Code). Omitted
|
||||
* / 0 keeps the plain `Thinking…` line.
|
||||
*/
|
||||
thinkingTokens?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -30,9 +36,14 @@ interface TypingIndicatorProps {
|
||||
* typing line is always the generic "Thinking…" (it never includes the
|
||||
* role/identity name).
|
||||
*/
|
||||
export default function TypingIndicator({ assistantName, showName = true }: TypingIndicatorProps) {
|
||||
export default function TypingIndicator({ assistantName, showName = true, thinkingTokens }: TypingIndicatorProps) {
|
||||
const { t } = useTranslation();
|
||||
const name = resolveAssistantName(assistantName);
|
||||
// Show the running thinking-token count only once there is something to count.
|
||||
const thinkingLine =
|
||||
thinkingTokens && thinkingTokens > 0
|
||||
? t("Thinking… · {{count}} tokens", { count: thinkingTokens })
|
||||
: t("Thinking…");
|
||||
|
||||
return (
|
||||
<Box className={classes.messageRow}>
|
||||
@@ -48,7 +59,7 @@ export default function TypingIndicator({ assistantName, showName = true }: Typi
|
||||
<span />
|
||||
</span>
|
||||
<Text size="sm" c="dimmed">
|
||||
{t("Thinking…")}
|
||||
{thinkingLine}
|
||||
</Text>
|
||||
</Group>
|
||||
</Box>
|
||||
|
||||
@@ -106,6 +106,10 @@ export interface IAiChatMessageRow {
|
||||
inputTokens?: number;
|
||||
outputTokens?: number;
|
||||
totalTokens?: number;
|
||||
// Reasoning (thinking) tokens, when the provider reports them. Optional so
|
||||
// old history rows (recorded before this shipped) stay valid. Included in
|
||||
// `outputTokens` per the AI SDK usage shape.
|
||||
reasoningTokens?: number;
|
||||
};
|
||||
// Current context size for the turn = final-step (input+output) tokens, i.e.
|
||||
// how much the conversation occupies in the model's context window after this
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* ============================ CANONICAL #137 NOTE ============================
|
||||
* This docblock is the single authoritative explanation of the new-chat id
|
||||
* adoption design and the #137 two-tab race it fixes. Other call sites
|
||||
* (use-chat-session.ts, the server's `chatStreamStartMetadata`) reference here
|
||||
* (use-chat-session.ts, the server's `chatStreamMetadata`) reference here
|
||||
* rather than restating it.
|
||||
*
|
||||
* When a user sends the first turn of a BRAND-NEW chat, the client has no chat
|
||||
@@ -17,7 +17,7 @@
|
||||
* leak its later turns into it (#137). We adopt by IDENTITY instead, two ways:
|
||||
*
|
||||
* PRIMARY path: the server streams the real chat id on the assistant message
|
||||
* metadata's `start` part (see `chatStreamStartMetadata` server-side);
|
||||
* metadata's `start` part (see `chatStreamMetadata` server-side);
|
||||
* `extractServerChatId` reads it off the finished message and
|
||||
* `resolveAdoptedChatId` turns it into the id to adopt for a new chat. This is
|
||||
* authoritative and immune to the race.
|
||||
@@ -46,7 +46,7 @@ export function resolveAdoptedChatId(
|
||||
/**
|
||||
* Read the authoritative server chat id off a finished assistant message. The
|
||||
* server attaches it as `message.metadata.chatId` on the `start` part (see
|
||||
* `chatStreamStartMetadata`). Returns it only when it is a string; undefined for
|
||||
* `chatStreamMetadata`). Returns it only when it is a string; undefined for
|
||||
* a missing message, missing metadata, or a non-string `chatId`.
|
||||
*/
|
||||
export function extractServerChatId(
|
||||
|
||||
@@ -314,6 +314,57 @@ describe("buildChatMarkdown — token totals", () => {
|
||||
});
|
||||
expect(md).toContain("- Total tokens: 99");
|
||||
});
|
||||
|
||||
it("appends the reasoning figure to the row footer when reasoningTokens > 0", () => {
|
||||
const md = buildChatMarkdown({
|
||||
title: "t",
|
||||
chatId: "c",
|
||||
rows: [
|
||||
row({
|
||||
role: "assistant",
|
||||
content: "x",
|
||||
metadata: {
|
||||
usage: { inputTokens: 10, outputTokens: 8, reasoningTokens: 3 },
|
||||
},
|
||||
}),
|
||||
],
|
||||
t,
|
||||
});
|
||||
expect(md).toContain("_Tokens — in: 10, out: 8, reasoning: 3, total: 18_");
|
||||
});
|
||||
|
||||
it("omits the reasoning figure when reasoningTokens is 0 / absent", () => {
|
||||
const zero = buildChatMarkdown({
|
||||
title: "t",
|
||||
chatId: "c",
|
||||
rows: [
|
||||
row({
|
||||
role: "assistant",
|
||||
content: "x",
|
||||
metadata: {
|
||||
usage: { inputTokens: 10, outputTokens: 5, reasoningTokens: 0 },
|
||||
},
|
||||
}),
|
||||
],
|
||||
t,
|
||||
});
|
||||
expect(zero).toContain("_Tokens — in: 10, out: 5, total: 15_");
|
||||
expect(zero).not.toContain("reasoning:");
|
||||
|
||||
const absent = buildChatMarkdown({
|
||||
title: "t",
|
||||
chatId: "c",
|
||||
rows: [
|
||||
row({
|
||||
role: "assistant",
|
||||
content: "x",
|
||||
metadata: { usage: { inputTokens: 10, outputTokens: 5 } },
|
||||
}),
|
||||
],
|
||||
t,
|
||||
});
|
||||
expect(absent).not.toContain("reasoning:");
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildChatMarkdown — pending / in-progress messages", () => {
|
||||
|
||||
@@ -77,6 +77,7 @@ function rowTokens(usage: {
|
||||
inputTokens?: number;
|
||||
outputTokens?: number;
|
||||
totalTokens?: number;
|
||||
reasoningTokens?: number;
|
||||
}): number {
|
||||
return (
|
||||
usage.totalTokens ?? (usage.inputTokens ?? 0) + (usage.outputTokens ?? 0)
|
||||
@@ -175,8 +176,14 @@ export function buildChatMarkdown(args: BuildChatMarkdownArgs): string {
|
||||
const usage = row.metadata?.usage;
|
||||
if (usage) {
|
||||
const total = usage.totalTokens ?? rowTokens(usage);
|
||||
// Reasoning (thinking) tokens are shown only when the provider reported a
|
||||
// positive count; old rows / non-reasoning providers omit it.
|
||||
const reasoning =
|
||||
usage.reasoningTokens && usage.reasoningTokens > 0
|
||||
? `, reasoning: ${usage.reasoningTokens}`
|
||||
: "";
|
||||
blocks.push(
|
||||
`_Tokens — in: ${usage.inputTokens ?? "?"}, out: ${usage.outputTokens ?? "?"}, total: ${total}_`,
|
||||
`_Tokens — in: ${usage.inputTokens ?? "?"}, out: ${usage.outputTokens ?? "?"}${reasoning}, total: ${total}_`,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { UIMessage } from "@ai-sdk/react";
|
||||
import {
|
||||
estimateTokens,
|
||||
liveTurnTokens,
|
||||
} from "@/features/ai-chat/utils/count-stream-tokens.ts";
|
||||
|
||||
const msg = (parts: unknown[], metadata?: unknown): UIMessage =>
|
||||
({
|
||||
id: Math.random().toString(),
|
||||
role: "assistant",
|
||||
parts,
|
||||
metadata,
|
||||
}) as UIMessage;
|
||||
|
||||
describe("estimateTokens", () => {
|
||||
it("returns 0 for the empty string", () => {
|
||||
expect(estimateTokens("")).toBe(0);
|
||||
});
|
||||
|
||||
it("ceils chars/4 so any non-empty text is at least 1 token", () => {
|
||||
expect(estimateTokens("a")).toBe(1);
|
||||
expect(estimateTokens("abcd")).toBe(1);
|
||||
expect(estimateTokens("abcde")).toBe(2);
|
||||
expect(estimateTokens("12345678")).toBe(2);
|
||||
});
|
||||
});
|
||||
|
||||
describe("liveTurnTokens — estimate path", () => {
|
||||
it("is all zeros for an undefined message", () => {
|
||||
expect(liveTurnTokens(undefined)).toEqual({
|
||||
reasoning: 0,
|
||||
output: 0,
|
||||
authoritative: false,
|
||||
});
|
||||
});
|
||||
|
||||
it("is all zeros for a parts-less message", () => {
|
||||
expect(liveTurnTokens({ id: "x", role: "assistant" } as UIMessage)).toEqual({
|
||||
reasoning: 0,
|
||||
output: 0,
|
||||
authoritative: false,
|
||||
});
|
||||
});
|
||||
|
||||
it("estimates output from text parts", () => {
|
||||
// 8 chars -> 2 tokens.
|
||||
const r = liveTurnTokens(msg([{ type: "text", text: "12345678" }]));
|
||||
expect(r).toEqual({ reasoning: 0, output: 2, authoritative: false });
|
||||
});
|
||||
|
||||
it("estimates reasoning from reasoning parts (kept separate from output)", () => {
|
||||
const r = liveTurnTokens(
|
||||
msg([
|
||||
{ type: "reasoning", text: "12345678" },
|
||||
{ type: "text", text: "abcd" },
|
||||
]),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 2, output: 1, authoritative: false });
|
||||
});
|
||||
|
||||
it("accumulates across multiple text + reasoning parts (multi-step)", () => {
|
||||
const r = liveTurnTokens(
|
||||
msg([
|
||||
{ type: "reasoning", text: "abcd" }, // 1
|
||||
{ type: "text", text: "abcd" }, // 1
|
||||
{ type: "tool-getPage", state: "output-available" }, // ignored
|
||||
{ type: "reasoning", text: "abcd" }, // 1
|
||||
{ type: "text", text: "abcdefgh" }, // 2
|
||||
]),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 2, output: 3, authoritative: false });
|
||||
});
|
||||
|
||||
it("ignores non text/reasoning parts (tools, step-start)", () => {
|
||||
const r = liveTurnTokens(
|
||||
msg([
|
||||
{ type: "step-start" },
|
||||
{ type: "tool-getPage", state: "input-available" },
|
||||
]),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 0, output: 0, authoritative: false });
|
||||
});
|
||||
});
|
||||
|
||||
describe("liveTurnTokens — authoritative path", () => {
|
||||
it("returns authoritative usage verbatim, splitting reasoning out of output", () => {
|
||||
// outputTokens INCLUDES reasoning in the AI SDK shape -> answer = 100 - 30.
|
||||
const r = liveTurnTokens(
|
||||
msg([{ type: "text", text: "estimate would be tiny" }], {
|
||||
usage: { inputTokens: 500, outputTokens: 100, reasoningTokens: 30 },
|
||||
}),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 30, output: 70, authoritative: true });
|
||||
});
|
||||
|
||||
it("treats missing reasoningTokens as 0 and keeps full output", () => {
|
||||
const r = liveTurnTokens(
|
||||
msg([{ type: "text", text: "x" }], {
|
||||
usage: { inputTokens: 10, outputTokens: 42 },
|
||||
}),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 0, output: 42, authoritative: true });
|
||||
});
|
||||
|
||||
it("never returns a negative output when reasoning exceeds reported output", () => {
|
||||
const r = liveTurnTokens(
|
||||
msg([], { usage: { outputTokens: 10, reasoningTokens: 40 } }),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 40, output: 0, authoritative: true });
|
||||
});
|
||||
|
||||
it("falls back to the estimate when metadata has no usage object", () => {
|
||||
const r = liveTurnTokens(
|
||||
msg([{ type: "text", text: "abcd" }], { chatId: "c1" }),
|
||||
);
|
||||
expect(r).toEqual({ reasoning: 0, output: 1, authoritative: false });
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,94 @@
|
||||
import type { UIMessage } from "@ai-sdk/react";
|
||||
|
||||
/**
|
||||
* Live token counting for a streaming AI-chat turn — split into REASONING
|
||||
* (thinking) and OUTPUT (answer) tokens, mirroring how Claude Code shows
|
||||
* `Thinking… · 60 tokens` next to its thinking indicator.
|
||||
*
|
||||
* No provider streams exact per-token usage mid-stream, so the live number is a
|
||||
* CLIENT ESTIMATE (chars/≈4 heuristic) that is reconciled to AUTHORITATIVE usage
|
||||
* once the server attaches it on a step/turn boundary (see the server's
|
||||
* `chatStreamMetadata` + the client's read of `message.metadata.usage`). When
|
||||
* authoritative usage is present we return it verbatim (the number "jumps to
|
||||
* exact"); otherwise we return the running estimate. Pure + unit-testable: it
|
||||
* never runs a real BPE tokenizer (that would be O(n²) on the hot path, bloat the
|
||||
* bundle, and be wrong for Gemini/Ollama anyway).
|
||||
*/
|
||||
|
||||
/**
|
||||
* Rough token estimate for a piece of text using the standard chars/≈4 heuristic.
|
||||
* Returns 0 for empty/whitespace-free-of-content input, and ceils so any
|
||||
* non-empty text counts as at least one token.
|
||||
*/
|
||||
export function estimateTokens(text: string): number {
|
||||
if (!text) return 0;
|
||||
return Math.ceil(text.length / 4);
|
||||
}
|
||||
|
||||
/** Authoritative per-step/turn usage the server attaches to message metadata. */
|
||||
export interface AuthoritativeUsage {
|
||||
inputTokens?: number;
|
||||
outputTokens?: number;
|
||||
totalTokens?: number;
|
||||
reasoningTokens?: number;
|
||||
}
|
||||
|
||||
/** Live token split for a turn's tail (streaming) assistant message. */
|
||||
export interface LiveTurnTokens {
|
||||
/** Thinking/reasoning tokens (estimate, or authoritative when available). */
|
||||
reasoning: number;
|
||||
/** Answer/output tokens (estimate, or authoritative when available). */
|
||||
output: number;
|
||||
/** True when the numbers come from authoritative server usage, not estimate. */
|
||||
authoritative: boolean;
|
||||
}
|
||||
|
||||
/** Read the authoritative usage off a UIMessage's metadata, if the server set it. */
|
||||
function metadataUsage(message: UIMessage): AuthoritativeUsage | undefined {
|
||||
const meta = message?.metadata as
|
||||
| { usage?: AuthoritativeUsage }
|
||||
| undefined;
|
||||
const usage = meta?.usage;
|
||||
if (!usage || typeof usage !== "object") return undefined;
|
||||
return usage;
|
||||
}
|
||||
|
||||
/**
|
||||
* Token split for the given (streaming) assistant message.
|
||||
*
|
||||
* Prefers AUTHORITATIVE `metadata.usage` when the server has attached it (at a
|
||||
* step/turn boundary, incl. `reasoningTokens`) — so the live counter snaps to the
|
||||
* provider's exact figures. Until then it returns a running ESTIMATE summed over
|
||||
* the message parts: `reasoning` parts feed the reasoning estimate, `text` parts
|
||||
* feed the output estimate. Multi-part / multi-step turns accumulate naturally
|
||||
* because every part of the turn is summed.
|
||||
*
|
||||
* Providers that don't stream reasoning text still surface a reasoning count once
|
||||
* the authoritative usage arrives (`usage.reasoningTokens`); on the pure estimate
|
||||
* path such a turn simply shows `reasoning: 0` until then.
|
||||
*/
|
||||
export function liveTurnTokens(message: UIMessage | undefined): LiveTurnTokens {
|
||||
if (!message) return { reasoning: 0, output: 0, authoritative: false };
|
||||
|
||||
const usage = metadataUsage(message);
|
||||
if (usage) {
|
||||
// Authoritative branch: outputTokens already INCLUDES reasoning tokens in the
|
||||
// AI SDK usage shape, so subtract reasoning out for the "answer" figure (never
|
||||
// go negative if a provider reports them inconsistently).
|
||||
const reasoning = usage.reasoningTokens ?? 0;
|
||||
const totalOutput = usage.outputTokens ?? 0;
|
||||
const output = Math.max(0, totalOutput - reasoning);
|
||||
return { reasoning, output, authoritative: true };
|
||||
}
|
||||
|
||||
let reasoning = 0;
|
||||
let output = 0;
|
||||
for (const part of message.parts ?? []) {
|
||||
if (part.type === "reasoning") {
|
||||
reasoning += estimateTokens((part as { text?: string }).text ?? "");
|
||||
} else if (part.type === "text") {
|
||||
output += estimateTokens((part as { text?: string }).text ?? "");
|
||||
}
|
||||
}
|
||||
return { reasoning, output, authoritative: false };
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { UIMessage } from "@ai-sdk/react";
|
||||
import { reasoningTokensForPart } from "@/features/ai-chat/utils/reasoning-tokens.ts";
|
||||
|
||||
/**
|
||||
* Pure-helper tests for `reasoningTokensForPart`, the #151 anti-double-count
|
||||
* rule: the authoritative `usage.reasoningTokens` is the TURN TOTAL, so it may
|
||||
* only be attributed when the turn has exactly one reasoning part. With multiple
|
||||
* reasoning parts (or no authoritative usage) every part falls back to its own
|
||||
* per-part estimate, signalled here by `undefined`.
|
||||
*/
|
||||
const msg = (
|
||||
parts: UIMessage["parts"],
|
||||
metadata?: unknown,
|
||||
): UIMessage =>
|
||||
({
|
||||
id: Math.random().toString(),
|
||||
role: "assistant",
|
||||
parts,
|
||||
metadata,
|
||||
}) as UIMessage;
|
||||
|
||||
describe("reasoningTokensForPart", () => {
|
||||
it("single reasoning part -> the authoritative turn total", () => {
|
||||
const m = msg(
|
||||
[
|
||||
{ type: "reasoning", text: "thinking…" } as never,
|
||||
{ type: "text", text: "answer" },
|
||||
],
|
||||
{ usage: { reasoningTokens: 42 } },
|
||||
);
|
||||
expect(reasoningTokensForPart(m)).toBe(42);
|
||||
});
|
||||
|
||||
it("multiple reasoning parts -> undefined (each estimates on its own)", () => {
|
||||
const m = msg(
|
||||
[
|
||||
{ type: "reasoning", text: "step one" } as never,
|
||||
{ type: "reasoning", text: "step two" } as never,
|
||||
{ type: "text", text: "answer" },
|
||||
],
|
||||
{ usage: { reasoningTokens: 99 } },
|
||||
);
|
||||
// Even with an authoritative total, two reasoning parts must each estimate
|
||||
// (attributing the total to one would double-count against the other).
|
||||
expect(reasoningTokensForPart(m)).toBeUndefined();
|
||||
});
|
||||
|
||||
it("no authoritative usage -> undefined even for a single reasoning part", () => {
|
||||
const m = msg([
|
||||
{ type: "reasoning", text: "thinking…" } as never,
|
||||
{ type: "text", text: "answer" },
|
||||
]);
|
||||
expect(reasoningTokensForPart(m)).toBeUndefined();
|
||||
});
|
||||
});
|
||||
34
apps/client/src/features/ai-chat/utils/reasoning-tokens.ts
Normal file
34
apps/client/src/features/ai-chat/utils/reasoning-tokens.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { UIMessage } from "@ai-sdk/react";
|
||||
|
||||
/**
|
||||
* Decide the authoritative reasoning token count to attribute to a single
|
||||
* `reasoning` part of an assistant message — or `undefined` when the part should
|
||||
* fall back to its own per-part estimate.
|
||||
*
|
||||
* `usage.reasoningTokens` is the TURN TOTAL, so it may only be attributed to a
|
||||
* block when the turn has exactly ONE reasoning part (the common one-step turn):
|
||||
* then that block can show the exact figure. With MULTIPLE reasoning parts (a
|
||||
* multi-step agent turn) every block must fall back to its own estimate —
|
||||
* attributing the turn total to one of them would double-count against the
|
||||
* others' estimates (#151 review anti-double-count rule). When there is no
|
||||
* authoritative usage at all, every part estimates.
|
||||
*
|
||||
* Returns the authoritative `reasoningTokens` only for the single-reasoning-part
|
||||
* case; `undefined` otherwise (the caller estimates from the part text).
|
||||
*/
|
||||
export function reasoningTokensForPart(
|
||||
message: UIMessage,
|
||||
): number | undefined {
|
||||
const reasoningTokens = (
|
||||
message.metadata as { usage?: { reasoningTokens?: number } } | undefined
|
||||
)?.usage?.reasoningTokens;
|
||||
|
||||
const reasoningPartCount = (message.parts ?? []).reduce(
|
||||
(acc, p) => (p.type === "reasoning" ? acc + 1 : acc),
|
||||
0,
|
||||
);
|
||||
|
||||
// Exactly one reasoning part -> attribute the authoritative turn total to it.
|
||||
// Otherwise (zero or multiple) each part estimates on its own.
|
||||
return reasoningPartCount === 1 ? reasoningTokens : undefined;
|
||||
}
|
||||
Reference in New Issue
Block a user