The header token badge (and the "Thinking… · N tokens" line) froze between agent steps and jumped in chunks instead of ticking smoothly. liveTurnTokens returned the authoritative server `usage` VERBATIM as soon as it appeared, but the server only attaches usage at a step boundary and it is cumulative over COMPLETED steps — so during the next (in-flight) step the figure stayed frozen at the previous boundary and the running text estimate was ignored. Combine both sources per component via max: always compute the running estimate (chars/≈4 over the message's reasoning/text parts, which includes the in-flight step) and take max(authoritativeBase, estimate). Between boundaries the estimate ticks the number up; at a boundary the authoritative figure snaps it exact; and because the server usage is cumulative and we only ever take the max, the counter is monotonic (never drops). Reasoning/output stay split; the #151 reasoning-only authoritative count is preserved. Backward compatible: in every existing test the estimate is <= the authoritative figure, so max returns the same value. +4 tests for the in-flight-step-exceeds- base case (output + reasoning), the authoritative-wins case, and monotonicity. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
114 lines
5.0 KiB
TypeScript
114 lines
5.0 KiB
TypeScript
import type { UIMessage } from "@ai-sdk/react";
|
|
|
|
/**
|
|
* Live token counting for a streaming AI-chat turn — split into REASONING
|
|
* (thinking) and OUTPUT (answer) tokens, mirroring how Claude Code shows
|
|
* `Thinking… · 60 tokens` next to its thinking indicator.
|
|
*
|
|
* No provider streams exact per-token usage mid-stream, so the live number is a
|
|
* CLIENT ESTIMATE (chars/≈4 heuristic) that is reconciled to AUTHORITATIVE usage
|
|
* once the server attaches it on a step/turn boundary (see the server's
|
|
* `chatStreamMetadata` + the client's read of `message.metadata.usage`). When
|
|
* authoritative usage is present we return it verbatim (the number "jumps to
|
|
* exact"); otherwise we return the running estimate. Pure + unit-testable: it
|
|
* never runs a real BPE tokenizer (that would be O(n²) on the hot path, bloat the
|
|
* bundle, and be wrong for Gemini/Ollama anyway).
|
|
*/
|
|
|
|
/**
|
|
* Rough token estimate for a piece of text using the standard chars/≈4 heuristic.
|
|
* Returns 0 for empty/whitespace-free-of-content input, and ceils so any
|
|
* non-empty text counts as at least one token.
|
|
*/
|
|
export function estimateTokens(text: string): number {
|
|
if (!text) return 0;
|
|
return Math.ceil(text.length / 4);
|
|
}
|
|
|
|
/** Authoritative per-step/turn usage the server attaches to message metadata. */
|
|
export interface AuthoritativeUsage {
|
|
inputTokens?: number;
|
|
outputTokens?: number;
|
|
totalTokens?: number;
|
|
reasoningTokens?: number;
|
|
}
|
|
|
|
/** Live token split for a turn's tail (streaming) assistant message. */
|
|
export interface LiveTurnTokens {
|
|
/** Thinking/reasoning tokens (estimate, or authoritative when available). */
|
|
reasoning: number;
|
|
/** Answer/output tokens (estimate, or authoritative when available). */
|
|
output: number;
|
|
/** True when the numbers come from authoritative server usage, not estimate. */
|
|
authoritative: boolean;
|
|
}
|
|
|
|
/** Read the authoritative usage off a UIMessage's metadata, if the server set it. */
|
|
function metadataUsage(message: UIMessage): AuthoritativeUsage | undefined {
|
|
const meta = message?.metadata as
|
|
| { usage?: AuthoritativeUsage }
|
|
| undefined;
|
|
const usage = meta?.usage;
|
|
if (!usage || typeof usage !== "object") return undefined;
|
|
return usage;
|
|
}
|
|
|
|
/**
|
|
* Token split for the given (streaming) assistant message.
|
|
*
|
|
* COMBINES the authoritative server usage with the running text estimate so the
|
|
* counter ticks in real time AND lands exact. The server only attaches
|
|
* `metadata.usage` at a step/turn boundary (`finish-step`/`finish`) and it is
|
|
* CUMULATIVE over COMPLETED steps — it does NOT yet include the in-flight step.
|
|
* So a multi-step turn that returned the authoritative figure verbatim would
|
|
* FREEZE between boundaries and jump in steps (issue #163).
|
|
*
|
|
* Instead we always compute the running ESTIMATE (chars/≈4 over the message's
|
|
* `reasoning`/`text` parts, which grows on every streamed delta) and take the
|
|
* per-component MAX of the authoritative base and the estimate:
|
|
* - between boundaries the estimate of the in-flight step ticks the number up;
|
|
* - at a boundary the authoritative figure snaps it to exact;
|
|
* - because the server's usage is cumulative and we only ever take the max, the
|
|
* number is MONOTONIC — it never drops.
|
|
*
|
|
* Providers that don't stream reasoning text still surface a reasoning count once
|
|
* the authoritative usage arrives (`max(reasoningTokens, 0)`); on the pure
|
|
* estimate path (no usage yet) such a turn shows `reasoning: 0` until then.
|
|
*/
|
|
export function liveTurnTokens(message: UIMessage | undefined): LiveTurnTokens {
|
|
if (!message) return { reasoning: 0, output: 0, authoritative: false };
|
|
|
|
// Running ESTIMATE over every reasoning/text part — grows on each delta. This
|
|
// includes the IN-FLIGHT step, which the authoritative usage does not cover yet.
|
|
let estReasoning = 0;
|
|
let estOutput = 0;
|
|
for (const part of message.parts ?? []) {
|
|
if (part.type === "reasoning") {
|
|
estReasoning += estimateTokens((part as { text?: string }).text ?? "");
|
|
} else if (part.type === "text") {
|
|
estOutput += estimateTokens((part as { text?: string }).text ?? "");
|
|
}
|
|
}
|
|
|
|
const usage = metadataUsage(message);
|
|
if (!usage) {
|
|
// No authoritative usage streamed yet: the estimate IS the live figure.
|
|
return { reasoning: estReasoning, output: estOutput, authoritative: false };
|
|
}
|
|
|
|
// Authoritative sum over COMPLETED steps. `outputTokens` already INCLUDES
|
|
// reasoning in the AI SDK usage shape, so subtract it out for the "answer"
|
|
// figure (never go negative if a provider reports them inconsistently).
|
|
const authReasoning = usage.reasoningTokens ?? 0;
|
|
const authOutput = Math.max(0, (usage.outputTokens ?? 0) - authReasoning);
|
|
|
|
// Per-component max: the in-flight step's estimate ticks above the completed-
|
|
// steps base between boundaries, and the authoritative figure wins once it
|
|
// exceeds the (rough) estimate at the next boundary. Monotonic by construction.
|
|
return {
|
|
reasoning: Math.max(authReasoning, estReasoning),
|
|
output: Math.max(authOutput, estOutput),
|
|
authoritative: true,
|
|
};
|
|
}
|