Merge pull request 'feat(ai-chat): realtime token counter + reasoning tokens (#151)' (#158) from feat/ai-chat-realtime-tokens into develop

Reviewed-on: #158
2026-06-24 13:07:51 +03:00
parent 7325eeac19 044e3f7e6a
commit 9225eeeeed
21 changed files with 979 additions and 31 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1147,6 +1147,12 @@
  "Ask a question about this documentation.": "Ask a question about this documentation.",
  "Ask a question…": "Ask a question…",
  "Thinking…": "Thinking…",
+  "Thinking… · {{count}} tokens": "Thinking… · {{count}} tokens",
+  "Thinking… · {{count}} tokens_one": "Thinking… · {{count}} token",
+  "Thinking… · {{count}} tokens_other": "Thinking… · {{count}} tokens",
+  "Thinking · {{count}} tokens": "Thinking · {{count}} tokens",
+  "Thinking · {{count}} tokens_one": "Thinking · {{count}} token",
+  "Thinking · {{count}} tokens_other": "Thinking · {{count}} tokens",
  "The assistant is unavailable right now. Please try again.": "The assistant is unavailable right now. Please try again.",
  "Public share assistant": "Public share assistant",
  "Let anonymous visitors of public shares ask an AI assistant scoped to that share's pages. You pay for the tokens.": "Let anonymous visitors of public shares ask an AI assistant scoped to that share's pages. You pay for the tokens.",
@@ -1158,6 +1164,7 @@
  "Built-in assistant persona": "Built-in assistant persona",
  "Minimize": "Minimize",
  "Current context size": "Current context size",
+  "Tokens generated this turn": "Tokens generated this turn",
  "AI agent": "AI agent",
  "Take a look at the current document": "Take a look at the current document",
  "AI agent is typing…": "AI agent is typing…",
--- a/apps/client/public/locales/ru-RU/translation.json
+++ b/apps/client/public/locales/ru-RU/translation.json
@@ -684,6 +684,14 @@
  "AI agent is typing…": "AI-агент печатает…",
  "{{name}} is typing…": "{{name}} печатает…",
  "Thinking…": "Думаю…",
+  "Thinking… · {{count}} tokens": "Думаю… · {{count}} токенов",
+  "Thinking… · {{count}} tokens_one": "Думаю… · {{count}} токен",
+  "Thinking… · {{count}} tokens_few": "Думаю… · {{count}} токена",
+  "Thinking… · {{count}} tokens_many": "Думаю… · {{count}} токенов",
+  "Thinking · {{count}} tokens": "Размышления · {{count}} токенов",
+  "Thinking · {{count}} tokens_one": "Размышления · {{count}} токен",
+  "Thinking · {{count}} tokens_few": "Размышления · {{count}} токена",
+  "Thinking · {{count}} tokens_many": "Размышления · {{count}} токенов",
  "Agent role": "Роль агента",
  "AI chat": "AI-чат",
  "AI chat is disabled for this workspace.": "AI-чат отключён для этого рабочего пространства.",
@@ -694,6 +702,7 @@
  "Copy chat": "Копировать чат",
  "Created successfully": "Успешно создано",
  "Current context size": "Текущий размер контекста",
+  "Tokens generated this turn": "Токенов сгенерировано за ход",
  "Delete this chat?": "Удалить этот чат?",
  "Deleted successfully": "Успешно удалено",
  "Edited by AI agent on behalf of {{name}}": "Отредактировано AI-агентом от имени {{name}}",
--- a/apps/client/src/features/ai-chat/components/ai-chat-window.tsx
+++ b/apps/client/src/features/ai-chat/components/ai-chat-window.tsx
@@ -156,6 +156,12 @@ export default function AiChatWindow() {
    isStreaming: false,
  });

+  // Live turn-token total (reasoning + output) for the in-flight turn, pushed up
+  // (THROTTLED to ~8 Hz inside ChatThread) so the header badge ticks mid-stream.
+  // `null` means no turn is in flight -> the badge falls back to the persisted
+  // context size below.
+  const [liveTurnTokens, setLiveTurnTokens] = useState<number | null>(null);
+
  // The page the user is currently viewing. AiChatWindow lives in a pathless
  // parent layout route, so useParams() can't see :pageSlug. Match the full
  // pathname against the authenticated page route instead so "the current page"
@@ -485,11 +491,19 @@ export default function AiChatWindow() {
        )}

        <div style={{ flex: 1, display: "flex", justifyContent: "center" }}>
-          {contextTokens > 0 && (
+          {/* While a turn streams, show the LIVE turn-token count (ticks ~8 Hz);
+              once it finishes, fall back to the persisted context size. Require
+              > 0 so the very first emit (an empty tail message, count 0) does not
+              flash a "0" badge before any token streams in (#151 review). */}
+          {liveTurnTokens !== null && liveTurnTokens > 0 ? (
+            <Tooltip label={t("Tokens generated this turn")} withArrow>
+              <span className={classes.badge}>{formatTokens(liveTurnTokens)}</span>
+            </Tooltip>
+          ) : contextTokens > 0 ? (
            <Tooltip label={t("Current context size")} withArrow>
              <span className={classes.badge}>{formatTokens(contextTokens)}</span>
            </Tooltip>
-          )}
+          ) : null}
        </div>

        <div style={{ display: "flex", alignItems: "center", gap: 1 }}>
@@ -608,6 +622,7 @@ export default function AiChatWindow() {
              assistantName={currentRole?.name}
              onTurnFinished={onTurnFinished}
              liveStateRef={liveThreadRef}
+              onLiveTurnTokens={setLiveTurnTokens}
            />
          )}
        </div>
--- a/apps/client/src/features/ai-chat/components/ai-chat.module.css
+++ b/apps/client/src/features/ai-chat/components/ai-chat.module.css
@@ -111,6 +111,24 @@
    background: light-dark(var(--mantine-color-gray-0), var(--mantine-color-dark-6));
 }

+/* Collapsible "Thinking" (reasoning) block: a subtle left rule, dimmer than the
+   answer so it reads as secondary thinking context above the real answer. */
+.reasoningBlock {
+    border-left: 2px solid light-dark(var(--mantine-color-gray-3), var(--mantine-color-dark-4));
+    padding-left: 8px;
+}
+
+.reasoningText {
+    margin-top: 4px;
+    font-size: var(--mantine-font-size-xs);
+    color: light-dark(var(--mantine-color-gray-7), var(--mantine-color-dark-1));
+    white-space: pre-wrap;
+}
+
+.reasoningText p {
+    margin: 0 0 4px;
+}
+
 .inputWrapper {
    flex: 0 0 auto;
    padding-top: var(--mantine-spacing-xs);
--- a/apps/client/src/features/ai-chat/components/chat-thread.tsx
+++ b/apps/client/src/features/ai-chat/components/chat-thread.tsx
@@ -27,6 +27,7 @@ import {
 } from "@/features/ai-chat/utils/role-launch.ts";
 import { describeChatError } from "@/features/ai-chat/utils/error-message.ts";
 import { extractServerChatId } from "@/features/ai-chat/utils/adopt-chat-id.ts";
+import { liveTurnTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
 import {
  dequeue,
  enqueueMessage,
@@ -73,6 +74,12 @@ interface ChatThreadProps {
   *  assistant message. A ref (not state) avoids re-rendering the parent on
   *  every streamed delta. */
  liveStateRef?: MutableRefObject<{ messages: UIMessage[]; isStreaming: boolean }>;
+  /** Reports the live turn-token total (reasoning + output) for the in-flight
+   *  turn so the parent can show a header badge that ticks mid-stream. THROTTLED
+   *  here (~8 Hz) so the parent re-renders a handful of times a second, not on
+   *  every streamed delta. Called with `null` when no turn is in flight (the
+   *  parent then reverts the badge to the persisted context size). */
+  onLiveTurnTokens?: (tokens: number | null) => void;
 }

 /**
@@ -117,6 +124,7 @@ export default function ChatThread({
  assistantName,
  onTurnFinished,
  liveStateRef,
+  onLiveTurnTokens,
 }: ChatThreadProps) {
  const { t } = useTranslation();

@@ -314,6 +322,54 @@ export default function ChatThread({
    };
  }, [liveStateRef, messages, isStreaming]);

+  // Report the live turn-token total to the parent header badge, THROTTLED to
+  // ~8 Hz so the parent re-renders a few times a second instead of on every
+  // streamed delta. The tail assistant message's reasoning+output (estimate while
+  // streaming, authoritative once a step reports usage) is the live figure. When
+  // the turn ends any pending emit is cancelled and `null` is sent so the parent
+  // reverts the badge to the persisted context size.
+  const lastEmitRef = useRef(0);
+  const emitTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  // Newest total computed by the effect; the trailing timer reads it when it
+  // fires, so it never reports the value from the moment it was scheduled.
+  const latestTotalRef = useRef(0);
+  useEffect(() => {
+    if (!onLiveTurnTokens) return;
+    if (!isStreaming) {
+      // Turn ended (or never started): clear any pending throttle and revert.
+      if (emitTimerRef.current) {
+        clearTimeout(emitTimerRef.current);
+        emitTimerRef.current = null;
+      }
+      lastEmitRef.current = 0;
+      onLiveTurnTokens(null);
+      return;
+    }
+    const tail = messages[messages.length - 1];
+    const live = tail?.role === "assistant" ? liveTurnTokens(tail) : null;
+    const total = live ? live.reasoning + live.output : 0;
+    latestTotalRef.current = total;
+    const now = Date.now();
+    const MIN_INTERVAL = 120; // ms (~8 Hz)
+    const elapsed = now - lastEmitRef.current;
+    if (elapsed >= MIN_INTERVAL) {
+      lastEmitRef.current = now;
+      onLiveTurnTokens(total);
+    } else if (!emitTimerRef.current) {
+      // Trailing emit. The timer is armed at most once per throttle window, so
+      // it must read the ref (not the `total` captured here) to report the
+      // FINAL value of the burst rather than the first throttled one.
+      emitTimerRef.current = setTimeout(() => {
+        emitTimerRef.current = null;
+        lastEmitRef.current = Date.now();
+        onLiveTurnTokens(latestTotalRef.current);
+      }, MIN_INTERVAL - elapsed);
+    }
+  }, [messages, isStreaming, onLiveTurnTokens]);
+
+  // Clear any pending throttle timer on unmount (chat switch via `key`) so a
+  // trailing emit can't fire into a torn-down thread's parent.
+  useEffect(() => {
+    return () => {
+      if (emitTimerRef.current) clearTimeout(emitTimerRef.current);
+    };
+  }, []);
+
  // Classify the turn error into a heading + detail so the banner names the cause
  // (connection reset, timeout, rate limit, context overflow, quota, ...) instead
  // of a generic "Something went wrong".
--- a/apps/client/src/features/ai-chat/components/message-item.tsx
+++ b/apps/client/src/features/ai-chat/components/message-item.tsx
@@ -2,12 +2,14 @@ import { Box, Text } from "@mantine/core";
 import { useTranslation } from "react-i18next";
 import type { UIMessage } from "@ai-sdk/react";
 import ToolCallCard from "@/features/ai-chat/components/tool-call-card.tsx";
+import ReasoningBlock from "@/features/ai-chat/components/reasoning-block.tsx";
 import ChatErrorAlert from "@/features/ai-chat/components/chat-error-alert.tsx";
 import ChatStoppedNotice from "@/features/ai-chat/components/chat-stopped-notice.tsx";
 import { ToolUiPart, isToolPart } from "@/features/ai-chat/utils/tool-parts.tsx";
 import { assistantMessageHasVisibleContent } from "@/features/ai-chat/utils/message-content.ts";
 import { renderChatMarkdown } from "@/features/ai-chat/utils/markdown.ts";
 import { resolveAssistantName } from "@/features/ai-chat/utils/assistant-name.ts";
+import { reasoningTokensForPart } from "@/features/ai-chat/utils/reasoning-tokens.ts";
 import { describeChatError } from "@/features/ai-chat/utils/error-message.ts";
 import classes from "@/features/ai-chat/components/ai-chat.module.css";

@@ -77,12 +79,31 @@ export default function MessageItem({
  // return won't fire for them.
  if (!assistantMessageHasVisibleContent(message)) return null;

+  // Authoritative reasoning token count to attribute to a reasoning block, or
+  // undefined when the block must estimate on its own. See reasoningTokensForPart
+  // for the #151 anti-double-count rule (only a single reasoning part may carry
+  // the turn total). The authoritative turn total is still surfaced live in the
+  // header badge regardless.
+  const reasoningTokens = reasoningTokensForPart(message);
+
  return (
    <Box className={classes.messageRow}>
      <Text size="xs" c="dimmed" mb={4}>
        {resolveAssistantName(assistantName) ?? t("AI agent")}
      </Text>
      {message.parts.map((part, index) => {
+        if (part.type === "reasoning") {
+          // Reasoning ("thinking") -> a collapsible block with its own token
+          // count. Empty/whitespace reasoning with no authoritative count carries
+          // nothing to show, so skip it (avoids an empty 0-token block).
+          const text = (part as { text?: string }).text ?? "";
+          if (!text.trim() && !(reasoningTokens && reasoningTokens > 0))
+            return null;
+          return (
+            <ReasoningBlock key={index} text={text} tokens={reasoningTokens} />
+          );
+        }
+
        if (part.type === "text") {
          // Skip empty/whitespace-only text parts (a streaming message often
          // starts with an empty text part before the first token arrives); the
--- a/apps/client/src/features/ai-chat/components/message-list.tsx
+++ b/apps/client/src/features/ai-chat/components/message-list.tsx
@@ -6,6 +6,7 @@ import MessageItem from "@/features/ai-chat/components/message-item.tsx";
 import TypingIndicator from "@/features/ai-chat/components/typing-indicator.tsx";
 import { isToolPart, toolRunState, ToolUiPart } from "@/features/ai-chat/utils/tool-parts.tsx";
 import { assistantMessageHasVisibleContent } from "@/features/ai-chat/utils/message-content.ts";
+import { liveTurnTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
 import classes from "@/features/ai-chat/components/ai-chat.module.css";

 interface MessageListProps {
@@ -94,6 +95,19 @@ export function typingIndicatorShowsName(messages: UIMessage[]): boolean {
  return !assistantMessageHasVisibleContent(last);
 }

+/**
+ * The live thinking-token count to show on the standalone typing indicator. It
+ * is the reasoning split of the tail assistant message (estimate while streaming,
+ * authoritative once the server attaches usage at a step/turn boundary). Returns
+ * 0 when the turn has produced no reasoning yet — the indicator then shows the
+ * plain "Thinking…" line.
+ */
+export function tailThinkingTokens(messages: UIMessage[]): number {
+  const last = messages[messages.length - 1];
+  if (!last || last.role !== "assistant") return 0;
+  return liveTurnTokens(last).reasoning;
+}
+
 /**
 * Scrollable transcript. Auto-scrolls to the newest message as it streams in,
 * but only while the user is pinned to the bottom — if they scrolled up to read
@@ -190,7 +204,13 @@ export default function MessageList({
            assistantName={assistantName}
          />
        ))}
-        {typing && <TypingIndicator assistantName={assistantName} showName={typingIndicatorShowsName(messages)} />}
+        {typing && (
+          <TypingIndicator
+            assistantName={assistantName}
+            showName={typingIndicatorShowsName(messages)}
+            thinkingTokens={tailThinkingTokens(messages)}
+          />
+        )}
      </Stack>
    </ScrollArea>
  );
--- a/apps/client/src/features/ai-chat/components/reasoning-block.test.tsx
+++ b/apps/client/src/features/ai-chat/components/reasoning-block.test.tsx
@@ -0,0 +1,65 @@
+import { describe, it, expect, vi } from "vitest";
+import { render, screen } from "@testing-library/react";
+import { MantineProvider } from "@mantine/core";
+
+// Stub react-i18next so `t` returns the key with `{{count}}` interpolated. This
+// keeps the assertions on the component's OWN count logic (authoritative vs
+// estimate) rather than on translation, and mirrors the t-mock pattern used by
+// other component tests in the repo.
+vi.mock("react-i18next", () => ({
+  useTranslation: () => ({
+    t: (key: string, opts?: { count?: number }) =>
+      opts && typeof opts.count === "number"
+        ? key.replace("{{count}}", String(opts.count))
+        : key,
+  }),
+}));
+
+import ReasoningBlock from "./reasoning-block";
+import { estimateTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
+
+// matchMedia (read by MantineProvider) is stubbed globally in vitest.setup.ts.
+
+function renderBlock(props: { text: string; tokens?: number }) {
+  return render(
+    <MantineProvider>
+      <ReasoningBlock {...props} />
+    </MantineProvider>,
+  );
+}
+
+describe("ReasoningBlock", () => {
+  it("shows the authoritative count in the header when tokens > 0", () => {
+    // Text "thinking…" estimates to ceil(9/4) = 3, but the authoritative 42
+    // must win, so the header shows 42 (and NOT the 3-token estimate).
+    renderBlock({ text: "thinking…", tokens: 42 });
+    expect(screen.getByText("Thinking · 42 tokens")).toBeDefined();
+    expect(screen.queryByText("Thinking · 3 tokens")).toBeNull();
+  });
+
+  it("falls back to the text-length estimate when no authoritative tokens", () => {
+    const text = "some reasoning prose that streams in";
+    const estimate = estimateTokens(text);
+    renderBlock({ text });
+    expect(estimate).toBeGreaterThan(0);
+    expect(screen.getByText(new RegExp(`${estimate} tokens`))).toBeDefined();
+  });
+
+  it("header-only when text is empty but an authoritative count is present", () => {
+    renderBlock({ text: "", tokens: 17 });
+    expect(screen.getByText(/17 tokens/)).toBeDefined();
+    // No disclosure body to expand: the toggle button is disabled.
+    const button = screen.getByRole("button");
+    expect((button as HTMLButtonElement).disabled).toBe(true);
+  });
+
+  it("renders the reasoning body (markdown or raw-text fallback)", () => {
+    renderBlock({ text: "**bold** reasoning", tokens: 5 });
+    // The toggle is enabled because there IS body text to expand.
+    const button = screen.getByRole("button");
+    expect((button as HTMLButtonElement).disabled).toBe(false);
+    // The body prose renders (markdown -> sanitized html, or raw-text fallback);
+    // either way the text is present in the document.
+    expect(screen.getByText(/reasoning/)).toBeDefined();
+  });
+});
--- a/apps/client/src/features/ai-chat/components/reasoning-block.tsx
+++ b/apps/client/src/features/ai-chat/components/reasoning-block.tsx
@@ -0,0 +1,83 @@
+import { useMemo, useState } from "react";
+import { Box, Collapse, Group, Text, UnstyledButton } from "@mantine/core";
+import { IconChevronDown } from "@tabler/icons-react";
+import { useTranslation } from "react-i18next";
+import { estimateTokens } from "@/features/ai-chat/utils/count-stream-tokens.ts";
+import { renderChatMarkdown } from "@/features/ai-chat/utils/markdown.ts";
+import classes from "@/features/ai-chat/components/ai-chat.module.css";
+
+interface ReasoningBlockProps {
+  /** The streamed/persisted reasoning (thinking) text. May be empty when the
+   *  provider reports only a reasoning token COUNT without the text. */
+  text: string;
+  /** Authoritative reasoning token count from `usage.reasoningTokens`, when the
+   *  step/turn has finished. When absent (or 0) the count is estimated from the
+   *  text length so it ticks live as the reasoning streams in. */
+  tokens?: number;
+}
+
+/**
+ * Collapsible "Thinking" block for an assistant `reasoning` part. Mirrors Claude
+ * Code's surfacing of the model's thinking: a header that shows the thinking
+ * token count (authoritative when the step has reported usage, else a live
+ * estimate from the streamed text) and an expandable body with the reasoning
+ * prose. Collapsed by default so it never crowds out the answer.
+ *
+ * Providers that don't stream reasoning TEXT still render this block from the
+ * authoritative count alone (header only, empty body) so the cost is visible.
+ *
+ * @param text   Reasoning text; an empty/whitespace-only value disables the
+ *               toggle and renders no body.
+ * @param tokens Authoritative reasoning token count; when absent or not
+ *               positive the header falls back to `estimateTokens(text)`.
+ */
+export default function ReasoningBlock({ text, tokens }: ReasoningBlockProps) {
+  const { t } = useTranslation();
+  const [open, setOpen] = useState(false);
+
+  // Authoritative count wins; otherwise estimate live from the streamed text.
+  const count = tokens && tokens > 0 ? tokens : estimateTokens(text);
+  const trimmed = text.trim();
+  // Memoized on the trimmed text so a re-render with an unchanged reasoning
+  // body (e.g. while later parts of the same message keep streaming) does not
+  // re-run markdown rendering + sanitization for identical input.
+  const html = useMemo(
+    () => (trimmed ? renderChatMarkdown(trimmed, {}) : ""),
+    [trimmed],
+  );
+
+  // NOTE(review): the zero-count header below uses the plain `Thinking` key.
+  // The locale keys visible alongside this component are `Thinking…` and
+  // `Thinking · {{count}} tokens` — confirm `Thinking` exists in every locale.
+  return (
+    <Box className={classes.reasoningBlock} mb={6}>
+      <UnstyledButton
+        onClick={() => setOpen((o) => !o)}
+        // No body to expand when the provider reported only a token count.
+        disabled={!trimmed}
+        aria-expanded={open}
+      >
+        <Group gap={6} wrap="nowrap" align="center">
+          <IconChevronDown
+            size={12}
+            style={{
+              transform: open ? "none" : "rotate(-90deg)",
+              transition: "transform 150ms ease",
+              opacity: trimmed ? 1 : 0.4,
+            }}
+          />
+          <Text size="xs" c="dimmed">
+            {count > 0
+              ? t("Thinking · {{count}} tokens", { count })
+              : t("Thinking")}
+          </Text>
+        </Group>
+      </UnstyledButton>
+
+      {trimmed && (
+        <Collapse in={open}>
+          {html ? (
+            <div
+              className={classes.reasoningText}
+              // Sanitized by renderChatMarkdown (DOMPurify) before insertion.
+              dangerouslySetInnerHTML={{ __html: html }}
+            />
+          ) : (
+            // Raw-text fallback when markdown rendering yields an empty string.
+            <Text
+              className={classes.reasoningText}
+              style={{ whiteSpace: "pre-wrap" }}
+            >
+              {trimmed}
+            </Text>
+          )}
+        </Collapse>
+      )}
+    </Box>
+  );
+}
--- a/apps/client/src/features/ai-chat/components/tail-thinking-tokens.test.ts
+++ b/apps/client/src/features/ai-chat/components/tail-thinking-tokens.test.ts
@@ -0,0 +1,50 @@
+import { describe, expect, it } from "vitest";
+import type { UIMessage } from "@ai-sdk/react";
+import { tailThinkingTokens } from "@/features/ai-chat/components/message-list.tsx";
+
+/**
+ * Pure-helper tests for `tailThinkingTokens`: the live thinking-token count the
+ * standalone typing indicator shows. It is the reasoning split of the tail
+ * assistant message (estimate while streaming, authoritative once usage arrives).
+ */
+const msg = (
+  role: "user" | "assistant",
+  parts: unknown[],
+  metadata?: unknown,
+): UIMessage =>
+  ({ id: Math.random().toString(), role, parts, metadata }) as UIMessage;
+
+describe("tailThinkingTokens", () => {
+  it("is 0 when there are no messages", () => {
+    expect(tailThinkingTokens([])).toBe(0);
+  });
+
+  it("is 0 when the tail message is the user's", () => {
+    expect(tailThinkingTokens([msg("user", [{ type: "text", text: "q" }])])).toBe(0);
+  });
+
+  it("is 0 when the assistant has produced no reasoning yet", () => {
+    expect(
+      tailThinkingTokens([msg("assistant", [{ type: "text", text: "answer" }])]),
+    ).toBe(0);
+  });
+
+  it("estimates reasoning tokens from streamed reasoning text", () => {
+    // 8 chars -> 2 tokens.
+    expect(
+      tailThinkingTokens([
+        msg("assistant", [{ type: "reasoning", text: "12345678" }]),
+      ]),
+    ).toBe(2);
+  });
+
+  it("uses authoritative usage.reasoningTokens once the server attaches it", () => {
+    expect(
+      tailThinkingTokens([
+        msg("assistant", [{ type: "reasoning", text: "x" }], {
+          usage: { outputTokens: 100, reasoningTokens: 42 },
+        }),
+      ]),
+    ).toBe(42);
+  });
+});
--- a/apps/client/src/features/ai-chat/components/typing-indicator.tsx
+++ b/apps/client/src/features/ai-chat/components/typing-indicator.tsx
@@ -16,6 +16,12 @@ interface TypingIndicatorProps {
   * assistant row above already shows the same name, to avoid a duplicate label.
   */
  showName?: boolean;
+  /**
+   * Live thinking/reasoning token count for the in-flight turn. When > 0 the
+   * typing line becomes `Thinking… · {count} tokens` (like Claude Code). Omitted
+   * / 0 keeps the plain `Thinking…` line.
+   */
+  thinkingTokens?: number;
 }

 /**
@@ -30,9 +36,14 @@ interface TypingIndicatorProps {
 * typing line is always the generic "Thinking…" (it never includes the
 * role/identity name).
 */
-export default function TypingIndicator({ assistantName, showName = true }: TypingIndicatorProps) {
+export default function TypingIndicator({ assistantName, showName = true, thinkingTokens }: TypingIndicatorProps) {
  const { t } = useTranslation();
  const name = resolveAssistantName(assistantName);
+  // Show the running thinking-token count only once there is something to count.
+  const thinkingLine =
+    thinkingTokens && thinkingTokens > 0
+      ? t("Thinking… · {{count}} tokens", { count: thinkingTokens })
+      : t("Thinking…");

  return (
    <Box className={classes.messageRow}>
@@ -48,7 +59,7 @@ export default function TypingIndicator({ assistantName, showName = true }: Typi
          <span />
        </span>
        <Text size="sm" c="dimmed">
-          {t("Thinking…")}
+          {thinkingLine}
        </Text>
      </Group>
    </Box>
--- a/apps/client/src/features/ai-chat/types/ai-chat.types.ts
+++ b/apps/client/src/features/ai-chat/types/ai-chat.types.ts
@@ -106,6 +106,10 @@ export interface IAiChatMessageRow {
      inputTokens?: number;
      outputTokens?: number;
      totalTokens?: number;
+      // Reasoning (thinking) tokens, when the provider reports them. Optional so
+      // old history rows (recorded before this shipped) stay valid. Included in
+      // `outputTokens` per the AI SDK usage shape.
+      reasoningTokens?: number;
    };
    // Current context size for the turn = final-step (input+output) tokens, i.e.
    // how much the conversation occupies in the model's context window after this
--- a/apps/client/src/features/ai-chat/utils/adopt-chat-id.ts
+++ b/apps/client/src/features/ai-chat/utils/adopt-chat-id.ts
@@ -4,7 +4,7 @@
 * ============================ CANONICAL #137 NOTE ============================
 * This docblock is the single authoritative explanation of the new-chat id
 * adoption design and the #137 two-tab race it fixes. Other call sites
- * (use-chat-session.ts, the server's `chatStreamStartMetadata`) reference here
+ * (use-chat-session.ts, the server's `chatStreamMetadata`) reference here
 * rather than restating it.
 *
 * When a user sends the first turn of a BRAND-NEW chat, the client has no chat
@@ -17,7 +17,7 @@
 * leak its later turns into it (#137). We adopt by IDENTITY instead, two ways:
 *
 * PRIMARY path: the server streams the real chat id on the assistant message
- * metadata's `start` part (see `chatStreamStartMetadata` server-side);
+ * metadata's `start` part (see `chatStreamMetadata` server-side);
 * `extractServerChatId` reads it off the finished message and
 * `resolveAdoptedChatId` turns it into the id to adopt for a new chat. This is
 * authoritative and immune to the race.
@@ -46,7 +46,7 @@ export function resolveAdoptedChatId(
 /**
 * Read the authoritative server chat id off a finished assistant message. The
 * server attaches it as `message.metadata.chatId` on the `start` part (see
- * `chatStreamStartMetadata`). Returns it only when it is a string; undefined for
+ * `chatStreamMetadata`). Returns it only when it is a string; undefined for
 * a missing message, missing metadata, or a non-string `chatId`.
 */
 export function extractServerChatId(
--- a/apps/client/src/features/ai-chat/utils/chat-markdown.test.ts
+++ b/apps/client/src/features/ai-chat/utils/chat-markdown.test.ts
@@ -314,6 +314,57 @@ describe("buildChatMarkdown — token totals", () => {
    });
    expect(md).toContain("- Total tokens: 99");
  });
+
+  it("appends the reasoning figure to the row footer when reasoningTokens > 0", () => {
+    const md = buildChatMarkdown({
+      title: "t",
+      chatId: "c",
+      rows: [
+        row({
+          role: "assistant",
+          content: "x",
+          metadata: {
+            usage: { inputTokens: 10, outputTokens: 8, reasoningTokens: 3 },
+          },
+        }),
+      ],
+      t,
+    });
+    expect(md).toContain("_Tokens — in: 10, out: 8, reasoning: 3, total: 18_");
+  });
+
+  it("omits the reasoning figure when reasoningTokens is 0 / absent", () => {
+    const zero = buildChatMarkdown({
+      title: "t",
+      chatId: "c",
+      rows: [
+        row({
+          role: "assistant",
+          content: "x",
+          metadata: {
+            usage: { inputTokens: 10, outputTokens: 5, reasoningTokens: 0 },
+          },
+        }),
+      ],
+      t,
+    });
+    expect(zero).toContain("_Tokens — in: 10, out: 5, total: 15_");
+    expect(zero).not.toContain("reasoning:");
+
+    const absent = buildChatMarkdown({
+      title: "t",
+      chatId: "c",
+      rows: [
+        row({
+          role: "assistant",
+          content: "x",
+          metadata: { usage: { inputTokens: 10, outputTokens: 5 } },
+        }),
+      ],
+      t,
+    });
+    expect(absent).not.toContain("reasoning:");
+  });
 });

 describe("buildChatMarkdown — pending / in-progress messages", () => {
--- a/apps/client/src/features/ai-chat/utils/chat-markdown.ts
+++ b/apps/client/src/features/ai-chat/utils/chat-markdown.ts
@@ -77,6 +77,7 @@ function rowTokens(usage: {
  inputTokens?: number;
  outputTokens?: number;
  totalTokens?: number;
+  reasoningTokens?: number;
 }): number {
  return (
    usage.totalTokens ?? (usage.inputTokens ?? 0) + (usage.outputTokens ?? 0)
@@ -175,8 +176,14 @@ export function buildChatMarkdown(args: BuildChatMarkdownArgs): string {
    const usage = row.metadata?.usage;
    if (usage) {
      const total = usage.totalTokens ?? rowTokens(usage);
+      // Reasoning (thinking) tokens are shown only when the provider reported a
+      // positive count; old rows / non-reasoning providers omit it.
+      const reasoning =
+        usage.reasoningTokens && usage.reasoningTokens > 0
+          ? `, reasoning: ${usage.reasoningTokens}`
+          : "";
      blocks.push(
-        `_Tokens — in: ${usage.inputTokens ?? "?"}, out: ${usage.outputTokens ?? "?"}, total: ${total}_`,
+        `_Tokens — in: ${usage.inputTokens ?? "?"}, out: ${usage.outputTokens ?? "?"}${reasoning}, total: ${total}_`,
      );
    }
  });
--- a/apps/client/src/features/ai-chat/utils/count-stream-tokens.test.ts
+++ b/apps/client/src/features/ai-chat/utils/count-stream-tokens.test.ts
@@ -0,0 +1,119 @@
+import { describe, expect, it } from "vitest";
+import type { UIMessage } from "@ai-sdk/react";
+import {
+  estimateTokens,
+  liveTurnTokens,
+} from "@/features/ai-chat/utils/count-stream-tokens.ts";
+
+const msg = (parts: unknown[], metadata?: unknown): UIMessage =>
+  ({
+    id: Math.random().toString(),
+    role: "assistant",
+    parts,
+    metadata,
+  }) as UIMessage;
+
+describe("estimateTokens", () => {
+  it("returns 0 for the empty string", () => {
+    expect(estimateTokens("")).toBe(0);
+  });
+
+  it("ceils chars/4 so any non-empty text is at least 1 token", () => {
+    expect(estimateTokens("a")).toBe(1);
+    expect(estimateTokens("abcd")).toBe(1);
+    expect(estimateTokens("abcde")).toBe(2);
+    expect(estimateTokens("12345678")).toBe(2);
+  });
+});
+
+describe("liveTurnTokens — estimate path", () => {
+  it("is all zeros for an undefined message", () => {
+    expect(liveTurnTokens(undefined)).toEqual({
+      reasoning: 0,
+      output: 0,
+      authoritative: false,
+    });
+  });
+
+  it("is all zeros for a parts-less message", () => {
+    expect(liveTurnTokens({ id: "x", role: "assistant" } as UIMessage)).toEqual({
+      reasoning: 0,
+      output: 0,
+      authoritative: false,
+    });
+  });
+
+  it("estimates output from text parts", () => {
+    // 8 chars -> 2 tokens.
+    const r = liveTurnTokens(msg([{ type: "text", text: "12345678" }]));
+    expect(r).toEqual({ reasoning: 0, output: 2, authoritative: false });
+  });
+
+  it("estimates reasoning from reasoning parts (kept separate from output)", () => {
+    const r = liveTurnTokens(
+      msg([
+        { type: "reasoning", text: "12345678" },
+        { type: "text", text: "abcd" },
+      ]),
+    );
+    expect(r).toEqual({ reasoning: 2, output: 1, authoritative: false });
+  });
+
+  it("accumulates across multiple text + reasoning parts (multi-step)", () => {
+    const r = liveTurnTokens(
+      msg([
+        { type: "reasoning", text: "abcd" }, // 1
+        { type: "text", text: "abcd" }, // 1
+        { type: "tool-getPage", state: "output-available" }, // ignored
+        { type: "reasoning", text: "abcd" }, // 1
+        { type: "text", text: "abcdefgh" }, // 2
+      ]),
+    );
+    expect(r).toEqual({ reasoning: 2, output: 3, authoritative: false });
+  });
+
+  it("ignores non text/reasoning parts (tools, step-start)", () => {
+    const r = liveTurnTokens(
+      msg([
+        { type: "step-start" },
+        { type: "tool-getPage", state: "input-available" },
+      ]),
+    );
+    expect(r).toEqual({ reasoning: 0, output: 0, authoritative: false });
+  });
+});
+
+describe("liveTurnTokens — authoritative path", () => {
+  it("returns authoritative usage verbatim, splitting reasoning out of output", () => {
+    // outputTokens INCLUDES reasoning in the AI SDK shape -> answer = 100 - 30.
+    const r = liveTurnTokens(
+      msg([{ type: "text", text: "estimate would be tiny" }], {
+        usage: { inputTokens: 500, outputTokens: 100, reasoningTokens: 30 },
+      }),
+    );
+    expect(r).toEqual({ reasoning: 30, output: 70, authoritative: true });
+  });
+
+  it("treats missing reasoningTokens as 0 and keeps full output", () => {
+    const r = liveTurnTokens(
+      msg([{ type: "text", text: "x" }], {
+        usage: { inputTokens: 10, outputTokens: 42 },
+      }),
+    );
+    expect(r).toEqual({ reasoning: 0, output: 42, authoritative: true });
+  });
+
+  it("never returns a negative output when reasoning exceeds reported output", () => {
+    const r = liveTurnTokens(
+      msg([], { usage: { outputTokens: 10, reasoningTokens: 40 } }),
+    );
+    expect(r).toEqual({ reasoning: 40, output: 0, authoritative: true });
+  });
+
+  it("falls back to the estimate when metadata has no usage object", () => {
+    const r = liveTurnTokens(
+      msg([{ type: "text", text: "abcd" }], { chatId: "c1" }),
+    );
+    expect(r).toEqual({ reasoning: 0, output: 1, authoritative: false });
+  });
+});
--- a/apps/client/src/features/ai-chat/utils/count-stream-tokens.ts
+++ b/apps/client/src/features/ai-chat/utils/count-stream-tokens.ts
@@ -0,0 +1,94 @@
+import type { UIMessage } from "@ai-sdk/react";
+
+/**
+ * Live token counting for a streaming AI-chat turn — split into REASONING
+ * (thinking) and OUTPUT (answer) tokens, mirroring how Claude Code shows
+ * `Thinking… · 60 tokens` next to its thinking indicator.
+ *
+ * No provider streams exact per-token usage mid-stream, so the live number is a
+ * CLIENT ESTIMATE (chars/≈4 heuristic) that is reconciled to AUTHORITATIVE usage
+ * once the server attaches it on a step/turn boundary (see the server's
+ * `chatStreamMetadata` + the client's read of `message.metadata.usage`). When
+ * authoritative usage is present we return it verbatim (the number "jumps to
+ * exact"); otherwise we return the running estimate. Pure + unit-testable: it
+ * never runs a real BPE tokenizer (that would be O(n²) on the hot path, bloat the
+ * bundle, and be wrong for Gemini/Ollama anyway).
+ */
+
+/**
+ * Approximates the token count of `text` with the common "about four characters
+ * per token" rule of thumb.
+ *
+ * An empty string yields 0. Anything else — including whitespace-only text — is
+ * rounded UP, so any non-empty input counts as at least one token.
+ */
+export function estimateTokens(text: string): number {
+  const charCount = text ? text.length : 0;
+  return charCount === 0 ? 0 : Math.ceil(charCount / 4);
+}
+
+/**
+ * Authoritative per-step/turn usage the server attaches to message metadata.
+ * Every field is optional: a provider may omit any of them.
+ */
+export interface AuthoritativeUsage {
+  /** Prompt-side token count; not used for the reasoning/output split. */
+  inputTokens?: number;
+  /** Generated tokens; `liveTurnTokens` treats this as INCLUDING reasoning. */
+  outputTokens?: number;
+  /** Combined total as reported by the server; not read by `liveTurnTokens`. */
+  totalTokens?: number;
+  /** Thinking/reasoning tokens; subtracted from `outputTokens` for the answer. */
+  reasoningTokens?: number;
+}
+
+/**
+ * Live token split for a turn's tail (streaming) assistant message.
+ * This is the return shape of `liveTurnTokens`.
+ */
+export interface LiveTurnTokens {
+  /** Thinking/reasoning tokens (estimate, or authoritative when available). */
+  reasoning: number;
+  /** Answer/output tokens (estimate, or authoritative when available). */
+  output: number;
+  /** True when the numbers come from authoritative server usage, not estimate. */
+  authoritative: boolean;
+}
+
+/**
+ * Pulls the server-attached `usage` object out of a UIMessage's metadata.
+ * Yields `undefined` when the message, its metadata, or the `usage` entry is
+ * missing, or when `usage` is not an object.
+ */
+function metadataUsage(message: UIMessage): AuthoritativeUsage | undefined {
+  const candidate = (
+    message?.metadata as { usage?: AuthoritativeUsage } | undefined
+  )?.usage;
+  return candidate && typeof candidate === "object" ? candidate : undefined;
+}
+
+/**
+ * Token split for the given (streaming) assistant message.
+ *
+ * Prefers AUTHORITATIVE `metadata.usage` when the server has attached it (at a
+ * step/turn boundary, incl. `reasoningTokens`) — so the live counter snaps to the
+ * provider's exact figures. Until then it returns a running ESTIMATE summed over
+ * the message parts: `reasoning` parts feed the reasoning estimate, `text` parts
+ * feed the output estimate. Multi-part / multi-step turns accumulate naturally
+ * because every part of the turn is summed.
+ *
+ * A `usage` object only counts as authoritative when it carries a finite
+ * `outputTokens` or `reasoningTokens`. A usage object with neither (e.g. `{}`
+ * after serialization when the provider reported no counts, or one holding only
+ * `inputTokens`) is ignored, so the counter keeps its estimate instead of
+ * snapping to 0.
+ *
+ * Providers that don't stream reasoning text still surface a reasoning count once
+ * the authoritative usage arrives (`usage.reasoningTokens`); on the pure estimate
+ * path such a turn simply shows `reasoning: 0` until then.
+ *
+ * @param message The turn's tail assistant message, or `undefined` if none yet.
+ * @returns The reasoning/output split plus whether it is authoritative.
+ */
+export function liveTurnTokens(message: UIMessage | undefined): LiveTurnTokens {
+  if (!message) return { reasoning: 0, output: 0, authoritative: false };
+
+  // Metadata arrives over the wire, so validate each counter instead of
+  // trusting the declared type.
+  const finiteCount = (value: unknown): number | undefined =>
+    typeof value === "number" && Number.isFinite(value) ? value : undefined;
+
+  const usage = metadataUsage(message);
+  const reportedReasoning = finiteCount(usage?.reasoningTokens);
+  const reportedOutput = finiteCount(usage?.outputTokens);
+  if (reportedReasoning !== undefined || reportedOutput !== undefined) {
+    // Authoritative branch: outputTokens already INCLUDES reasoning tokens in the
+    // AI SDK usage shape, so subtract reasoning out for the "answer" figure (never
+    // go negative if a provider reports them inconsistently).
+    const reasoning = reportedReasoning ?? 0;
+    const output = Math.max(0, (reportedOutput ?? 0) - reasoning);
+    return { reasoning, output, authoritative: true };
+  }
+
+  // Estimate branch: sum the chars/4 heuristic over every reasoning/text part;
+  // other part types (tools, step markers) contribute nothing.
+  let reasoning = 0;
+  let output = 0;
+  for (const part of message.parts ?? []) {
+    if (part.type === "reasoning") {
+      reasoning += estimateTokens((part as { text?: string }).text ?? "");
+    } else if (part.type === "text") {
+      output += estimateTokens((part as { text?: string }).text ?? "");
+    }
+  }
+  return { reasoning, output, authoritative: false };
+}
--- a/apps/client/src/features/ai-chat/utils/reasoning-tokens.test.ts
+++ b/apps/client/src/features/ai-chat/utils/reasoning-tokens.test.ts
@@ -0,0 +1,56 @@
+import { describe, expect, it } from "vitest";
+import type { UIMessage } from "@ai-sdk/react";
+import { reasoningTokensForPart } from "@/features/ai-chat/utils/reasoning-tokens.ts";
+
+/**
+ * Pure-helper tests for `reasoningTokensForPart`, the #151 anti-double-count
+ * rule: the authoritative `usage.reasoningTokens` is the TURN TOTAL, so it may
+ * only be attributed when the turn has exactly one reasoning part. With multiple
+ * reasoning parts (or no authoritative usage) every part falls back to its own
+ * per-part estimate, signalled here by `undefined`.
+ */
+const msg = (parts: UIMessage["parts"], metadata?: unknown): UIMessage => {
+  // Minimal assistant-message fixture: a throwaway random id plus the supplied
+  // parts/metadata, cast because only these fields matter to the tests.
+  const id = Math.random().toString();
+  return { id, role: "assistant", parts, metadata } as UIMessage;
+};
+
+describe("reasoningTokensForPart", () => {
+  // Builds a reasoning part; cast because the fixture omits fields the helper
+  // under test never reads.
+  const reasoningPart = (text: string) => ({ type: "reasoning", text }) as never;
+
+  it("single reasoning part -> the authoritative turn total", () => {
+    const message = msg(
+      [reasoningPart("thinking…"), { type: "text", text: "answer" }],
+      { usage: { reasoningTokens: 42 } },
+    );
+    expect(reasoningTokensForPart(message)).toBe(42);
+  });
+
+  it("multiple reasoning parts -> undefined (each estimates on its own)", () => {
+    // An authoritative total exists, but with two reasoning parts it cannot be
+    // pinned on either one without double-counting against the other.
+    const message = msg(
+      [
+        reasoningPart("step one"),
+        reasoningPart("step two"),
+        { type: "text", text: "answer" },
+      ],
+      { usage: { reasoningTokens: 99 } },
+    );
+    expect(reasoningTokensForPart(message)).toBeUndefined();
+  });
+
+  it("no authoritative usage -> undefined even for a single reasoning part", () => {
+    const message = msg([
+      reasoningPart("thinking…"),
+      { type: "text", text: "answer" },
+    ]);
+    expect(reasoningTokensForPart(message)).toBeUndefined();
+  });
+});
--- a/apps/client/src/features/ai-chat/utils/reasoning-tokens.ts
+++ b/apps/client/src/features/ai-chat/utils/reasoning-tokens.ts
@@ -0,0 +1,34 @@
+import type { UIMessage } from "@ai-sdk/react";
+
+/**
+ * Picks the authoritative reasoning-token figure that a single `reasoning` part
+ * of an assistant message may display, or `undefined` when the part has to rely
+ * on its own text-based estimate.
+ *
+ * `usage.reasoningTokens` describes the WHOLE TURN. It can therefore be shown on
+ * a block only when the turn contains exactly ONE reasoning part (the usual
+ * one-step turn). When a turn has SEVERAL reasoning parts (a multi-step agent
+ * turn), pinning the turn total on one block would double-count against the
+ * estimates of the others (#151 review anti-double-count rule), so every block
+ * estimates instead. The same applies when no authoritative usage exists.
+ *
+ * @param message The assistant message whose parts and metadata are inspected.
+ * @returns `metadata.usage.reasoningTokens` when the message has exactly one
+ *   reasoning part (possibly `undefined` if the server sent none); `undefined`
+ *   for zero or multiple reasoning parts.
+ */
+export function reasoningTokensForPart(
+  message: UIMessage,
+): number | undefined {
+  type UsageCarrier = { usage?: { reasoningTokens?: number } } | undefined;
+
+  const reasoningParts = (message.parts ?? []).filter(
+    (part) => part.type === "reasoning",
+  );
+  // Zero or several reasoning parts -> the caller estimates per part.
+  if (reasoningParts.length !== 1) return undefined;
+
+  // Exactly one reasoning part -> it may carry the authoritative turn total.
+  return (message.metadata as UsageCarrier)?.usage?.reasoningTokens;
+}
--- a/apps/server/src/core/ai-chat/ai-chat.service.spec.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.service.spec.ts
@@ -5,7 +5,8 @@ import {
  rowToUiMessage,
  prepareAgentStep,
  buildPartialAssistantRecord,
-  chatStreamStartMetadata,
+  chatStreamMetadata,
+  accumulateStepUsage,
  MAX_AGENT_STEPS,
  FINAL_STEP_INSTRUCTION,
 } from './ai-chat.service';
@@ -298,18 +299,135 @@ describe('buildPartialAssistantRecord', () => {
 });

 /**
- * chatStreamStartMetadata: attach the authoritative chatId to the streamed
- * assistant UI message ONLY on the `start` part (so the client adopts the real
- * created chat id at the first chunk — see #137). Any non-start part adds none.
+ * chatStreamMetadata: attach metadata to the streamed assistant UI message per
+ * part type — `chatId` on `start` (so the client adopts the real created chat id
+ * at the first chunk — see #137), and AUTHORITATIVE usage (incl. reasoning
+ * tokens) on `finish-step` and `finish` so the client's live token counter snaps
+ * to exact at each step/turn boundary.
 */
-describe('chatStreamStartMetadata', () => {
+describe('chatStreamMetadata', () => {
   it('returns { chatId } for the start part', () => {
-    expect(chatStreamStartMetadata({ type: 'start' }, 'chat-1')).toEqual({
+    expect(chatStreamMetadata({ type: 'start' }, 'chat-1')).toEqual({
       chatId: 'chat-1',
     });
   });
 
-  it('returns undefined for a finish part (any non-start part)', () => {
-    expect(chatStreamStartMetadata({ type: 'finish' }, 'chat-1')).toBeUndefined();
+  it('returns the CUMULATIVE step usage passed in for the finish-step part', () => {
+    // finish-step usage is per-step in v6; the caller accumulates and passes the
+    // running sum, which this just wraps.
+    // The part's own `usage` (outputTokens: 100) deliberately differs from the
+    // third argument, proving the builder returns the argument, not the part.
+    expect(
+      chatStreamMetadata(
+        { type: 'finish-step', usage: { outputTokens: 100 } },
+        'chat-1',
+        { inputTokens: 500, outputTokens: 220, totalTokens: 720, reasoningTokens: 30 },
+      ),
+    ).toEqual({
+      usage: { inputTokens: 500, outputTokens: 220, totalTokens: 720, reasoningTokens: 30 },
+    });
+  });
+
+  it('returns turn usage for the finish part (reasoning from deprecated top-level field)', () => {
+    // No cumulative argument here: the finish branch normalizes the part's own
+    // `totalUsage`.
+    expect(
+      chatStreamMetadata(
+        {
+          type: 'finish',
+          totalUsage: {
+            inputTokens: 1000,
+            outputTokens: 250,
+            totalTokens: 1250,
+            reasoningTokens: 50,
+          },
+        },
+        'chat-1',
+      ),
+    ).toEqual({
+      usage: {
+        inputTokens: 1000,
+        outputTokens: 250,
+        totalTokens: 1250,
+        reasoningTokens: 50,
+      },
+    });
+  });
+
+  it('prefers outputTokenDetails.reasoningTokens over the deprecated field (finish)', () => {
+    // Both reasoning sources are present (5 vs 30); the nested one must win.
+    // inputTokens/totalTokens are absent on the input and stay undefined.
+    expect(
+      chatStreamMetadata(
+        {
+          type: 'finish',
+          totalUsage: {
+            outputTokens: 100,
+            reasoningTokens: 5,
+            outputTokenDetails: { reasoningTokens: 30 },
+          },
+        },
+        'chat-1',
+      ),
+    ).toEqual({
+      usage: {
+        inputTokens: undefined,
+        outputTokens: 100,
+        totalTokens: undefined,
+        reasoningTokens: 30,
+      },
+    });
+  });
+
+  it('returns undefined for a finish-step with no accumulated usage', () => {
+    // Third argument omitted -> nothing accumulated -> no metadata attached.
+    expect(
+      chatStreamMetadata({ type: 'finish-step' }, 'chat-1'),
+    ).toBeUndefined();
+  });
+
+  it('returns undefined for an unrelated part (e.g. text-delta)', () => {
+    expect(
+      chatStreamMetadata({ type: 'text-delta' }, 'chat-1'),
+    ).toBeUndefined();
+  });
+});
+
+/**
+ * accumulateStepUsage: sums per-step usage into a running cumulative total so the
+ * client never sees the live counter jump DOWN on a multi-step agent turn (#151).
+ */
+describe('accumulateStepUsage', () => {
+  it('sums every field across two steps', () => {
+    expect(
+      accumulateStepUsage(
+        { inputTokens: 500, outputTokens: 100, totalTokens: 600, reasoningTokens: 30 },
+        { inputTokens: 520, outputTokens: 80, totalTokens: 600, reasoningTokens: 10 },
+      ),
+    ).toEqual({
+      inputTokens: 1020,
+      outputTokens: 180,
+      totalTokens: 1200,
+      reasoningTokens: 40,
+    });
+  });
+
+  it('returns the step as-is when there is no accumulator yet', () => {
+    // First step of a turn: there is nothing to add to yet.
+    expect(accumulateStepUsage(undefined, { outputTokens: 10 })).toEqual({
+      outputTokens: 10,
+    });
+  });
+
+  it('returns the accumulator unchanged when the step usage is absent', () => {
+    const acc = { outputTokens: 10 };
+    // toBe (not toEqual): the very same object reference must come back.
+    expect(accumulateStepUsage(acc, undefined)).toBe(acc);
+  });
+
+  it('returns undefined when both sides are absent', () => {
+    expect(accumulateStepUsage(undefined, undefined)).toBeUndefined();
+  });
+
+  it('keeps a field undefined only when neither side has it', () => {
+    // Both sides are present, so the result lists all four fields; a field
+    // missing on BOTH sides stays undefined rather than becoming 0.
+    expect(
+      accumulateStepUsage({ outputTokens: 5 }, { outputTokens: 7 }),
+    ).toEqual({
+      inputTokens: undefined,
+      outputTokens: 12,
+      totalTokens: undefined,
+      reasoningTokens: undefined,
+    });
   });
 });
--- a/apps/server/src/core/ai-chat/ai-chat.service.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.service.ts
@@ -420,7 +420,11 @@ export class AiChatService {
          toolCalls: serializeSteps(steps),
          metadata: {
            finishReason,
-            usage: totalUsage,
+            // Persist the turn's cumulative usage WITH reasoning tokens resolved
+            // from either the new `outputTokenDetails` or the deprecated top-level
+            // field, so reopened history / the Markdown export show the thinking
+            // token cost too.
+            usage: normalizeStreamUsage(totalUsage as StreamUsage) ?? totalUsage,
            // Final-step usage = the context actually fed to the model on the last LLM
            // call (full history + tool results) plus the answer it just generated.
            // input+output of the FINAL step ≈ the conversation's CURRENT context size,
@@ -512,17 +516,42 @@ export class AiChatService {
      // does not buffer responses by default.
      // Scrub the SDK's hop-by-hop Connection header before it writes the head (Safari/HTTP2).
      stripStreamingHopByHopHeaders(res.raw);
+      // Running sum of per-step usage (v6 `finish-step.usage` is per-step). Sent
+      // as the cumulative authoritative usage so the client never jumps DOWN.
+      let cumulativeStepUsage: ChatStreamUsage | undefined;
      result.pipeUIMessageStreamToResponse(res.raw, {
        headers: { 'X-Accel-Buffering': 'no' },
        // Surface the authoritative chatId on the streamed assistant UI message so
        // the client adopts the REAL id of the row we created, instead of guessing
        // the newest chat in its list. `messageMetadata` is invoked by the AI SDK
-        // on the `start` and `finish` stream parts (ai@6); we attach `chatId` on the
-        // `start` part so it reaches the client (as message.metadata.chatId) at the
-        // very first chunk — before any second tab can race a newer chat into the
-        // list. This fixes the two-tab "adoption race" (#137) where a new chat in
-        // tab A could adopt tab B's id and leak its turns into the wrong row.
-        messageMetadata: ({ part }) => chatStreamStartMetadata(part, chatId),
+        // on the `start`, `finish-step` and `finish` stream parts (ai@6 — note the
+        // `finish-step` trigger relies on it being delivered as its own
+        // message-metadata chunk); we attach `chatId` on the `start` part so it
+        // reaches the client (as message.metadata.chatId) at the very first chunk —
+        // before any second tab can race a newer chat into the list. This fixes the
+        // two-tab "adoption race" (#137).
+        //
+        // `finish-step.usage` is PER-STEP (not cumulative) in v6, and the client
+        // merges each metadata.usage by replacement — so on a multi-step agent turn
+        // (up to MAX_AGENT_STEPS) the naive per-step value would make the live
+        // counter jump DOWN at each boundary. We keep a running sum here and send
+        // the CUMULATIVE usage, which converges to `finish.totalUsage` (#151).
+        messageMetadata: ({ part }) => {
+          const p = part as StreamMetadataPart;
+          if (p.type === 'finish-step') {
+            cumulativeStepUsage = accumulateStepUsage(
+              cumulativeStepUsage,
+              normalizeStreamUsage(p.usage),
+            );
+          }
+          return chatStreamMetadata(p, chatId, cumulativeStepUsage);
+        },
+        // Stream reasoning (thinking) parts to the client so the live counter can
+        // estimate reasoning tokens from streamed text. v6 default is already
+        // true; set explicitly so the intent survives any future SDK default
+        // change. Providers that don't emit reasoning text still surface the
+        // count via the authoritative `usage.reasoningTokens` on finish-step.
+        sendReasoning: true,
        onError: (error: unknown) => {
          // Reuse the shared formatter so provider error formatting stays
          // unified between the log line and the streamed error message.
@@ -573,16 +602,97 @@ export class AiChatService {
  }
 }

+/** Shape of the AI SDK v6 LanguageModelUsage we forward to the client. The SDK
+ *  exposes `reasoningTokens` both as a (deprecated) top-level field and under
+ *  `outputTokenDetails.reasoningTokens`; we normalize to a single field so the
+ *  client gets one stable usage shape regardless of provider/SDK version.
+ *  Only the fields this file reads are declared; all are optional. */
+interface StreamUsage {
+  inputTokens?: number;
+  outputTokens?: number;
+  totalTokens?: number;
+  /** Deprecated top-level reasoning count; used only as the fallback source. */
+  reasoningTokens?: number;
+  /** Newer nested reasoning count; preferred over the top-level field. */
+  outputTokenDetails?: { reasoningTokens?: number };
+}
+
+/** A streamed part the messageMetadata callback can receive (only the fields we read). */
+interface StreamMetadataPart {
+  /** Part discriminator; 'start', 'finish-step' and 'finish' are acted on. */
+  type: string;
+  /** Per-step usage, read when `type` is 'finish-step'. */
+  usage?: StreamUsage;
+  /** Whole-turn usage, read when `type` is 'finish'. */
+  totalUsage?: StreamUsage;
+}
+
+/** Authoritative usage we attach to a streamed assistant message's metadata.
+ *  Flat shape: the reasoning count lives in the single top-level
+ *  `reasoningTokens` field (there is no nested `outputTokenDetails` here). */
+export interface ChatStreamUsage {
+  inputTokens?: number;
+  outputTokens?: number;
+  totalTokens?: number;
+  reasoningTokens?: number;
+}
+
+/** Normalize an AI SDK usage object to our flat client-facing shape, resolving
+ *  reasoning tokens from either the new `outputTokenDetails` or the deprecated
+ *  top-level field (the nested one wins when both are set).
+ *
+ *  Returns undefined for a missing usage object AND for a usage object that
+ *  carries none of the four counts. An all-undefined result would still be a
+ *  truthy object, so callers would attach an empty `usage` to the message
+ *  metadata and any `?? fallback` on the result could never apply.
+ *
+ *  @param usage Raw usage from a stream part or turn result, if any.
+ *  @returns The flat usage (all four keys present, possibly undefined-valued),
+ *           or undefined when there is nothing to report. */
+function normalizeStreamUsage(
+  usage: StreamUsage | undefined,
+): ChatStreamUsage | undefined {
+  if (!usage) return undefined;
+  const normalized: ChatStreamUsage = {
+    inputTokens: usage.inputTokens,
+    outputTokens: usage.outputTokens,
+    totalTokens: usage.totalTokens,
+    reasoningTokens:
+      usage.outputTokenDetails?.reasoningTokens ?? usage.reasoningTokens,
+  };
+  // `!= null` on purpose: a reported 0 is real data, only null/undefined is not.
+  const hasAnyCount = Object.values(normalized).some((count) => count != null);
+  return hasAnyCount ? normalized : undefined;
+}
+
+/** Folds one (normalized) per-step usage into the running total for the turn.
+ *  v6 reports `finish-step.usage` PER STEP, so the caller keeps this running sum
+ *  and forwards it; the sum converges to the turn's `totalUsage`, which means
+ *  the client-side counter never moves down. When only one side is present it
+ *  is returned as-is (same reference); undefined only when both are absent.
+ *  Pure. */
+export function accumulateStepUsage(
+  acc: ChatStreamUsage | undefined,
+  step: ChatStreamUsage | undefined,
+): ChatStreamUsage | undefined {
+  // Adds two optional counts; stays undefined only if neither side has a value.
+  function sumOptional(left?: number, right?: number): number | undefined {
+    if (left == null && right == null) return undefined;
+    return (left ?? 0) + (right ?? 0);
+  }
+
+  if (acc && step) {
+    return {
+      inputTokens: sumOptional(acc.inputTokens, step.inputTokens),
+      outputTokens: sumOptional(acc.outputTokens, step.outputTokens),
+      totalTokens: sumOptional(acc.totalTokens, step.totalTokens),
+      reasoningTokens: sumOptional(acc.reasoningTokens, step.reasoningTokens),
+    };
+  }
+  // At most one side is present: hand it back untouched.
+  return acc ? acc : step;
+}
+
 /**
- * Attach the authoritative `chatId` to the streamed assistant message's `start`
- * part (as `message.metadata.chatId`) so the client can adopt the real id for a
- * new chat. See the client's adopt-chat-id.ts for the full #137 design.
+ * Builds the `message.metadata` payload for one streamed assistant UI message
+ * part. The AI SDK invokes `messageMetadata` for the `start`, `finish-step` and
+ * `finish` stream parts; per part type this returns:
+ *  - `start`        -> `{ chatId }`, letting the client adopt the id of the chat
+ *                      row that was really created, at the very first chunk
+ *                      (see adopt-chat-id.ts / #137).
+ *  - `finish-step`  -> `{ usage }` holding the CUMULATIVE authoritative usage so
+ *                      far (incl. reasoning tokens). The caller supplies that
+ *                      running sum as `cumulativeStepUsage` because v6 per-step
+ *                      usage is not cumulative; nothing accumulated -> undefined.
+ *  - `finish`       -> `{ usage }` normalized from the turn's `totalUsage` (the
+ *                      final reconcile); undefined when it cannot be normalized.
+ * Every other part type yields undefined. Pure + unit-testable.
  */
-export function chatStreamStartMetadata(
-  part: { type: string },
+export function chatStreamMetadata(
+  part: StreamMetadataPart,
   chatId: string,
-): { chatId: string } | undefined {
-  return part.type === 'start' ? { chatId } : undefined;
+  cumulativeStepUsage?: ChatStreamUsage,
+): { chatId: string } | { usage: ChatStreamUsage } | undefined {
+  switch (part.type) {
+    case 'start':
+      return { chatId };
+    case 'finish-step':
+      // The part's own per-step usage is ignored here on purpose; only the
+      // caller's running sum is forwarded.
+      if (!cumulativeStepUsage) return undefined;
+      return { usage: cumulativeStepUsage };
+    case 'finish': {
+      const turnUsage = normalizeStreamUsage(part.totalUsage);
+      if (!turnUsage) return undefined;
+      return { usage: turnUsage };
+    }
+    default:
+      return undefined;
+  }
 }

 /** The last message with role 'user' from a useChat payload, if any. */