diff --git a/backend/core/domain/experiment.py b/backend/core/domain/experiment.py
index ebb527e8..72c8bf8c 100644
--- a/backend/core/domain/experiment.py
+++ b/backend/core/domain/experiment.py
@@ -24,6 +24,7 @@ class ExperimentOutput(BaseModel):
     output: AgentOutput | None
     cost_usd: float | None
     duration_seconds: float | None
+    reasoning_token_count: float | None = None


 class ExperimentVersion(Version):
diff --git a/backend/core/storage/completion_storage.py b/backend/core/storage/completion_storage.py
index a138ee0e..8f290e64 100644
--- a/backend/core/storage/completion_storage.py
+++ b/backend/core/storage/completion_storage.py
@@ -6,7 +6,7 @@ from core.domain.experiment import Experiment
 from core.domain.version import Version


-type CompletionField = Literal["traces", "agent_id"]
+type CompletionField = Literal["traces", "agent_id", "input_variables", "input_messages", "output_messages", "messages"]


 class CompletionStorage(Protocol):
diff --git a/backend/protocol/api/_api_models.py b/backend/protocol/api/_api_models.py
index 927b7eeb..0ce25f71 100644
--- a/backend/protocol/api/_api_models.py
+++ b/backend/protocol/api/_api_models.py
@@ -496,6 +496,10 @@ class Completion(BaseModel):
         output: Output
         cost_usd: float
         duration_seconds: float
+        reasoning_token_count: float | None = Field(
+            default=None,
+            description="The number of reasoning tokens used in the inference, if applicable.",
+        )

     completions: list[Completion] | None = Field(default=None, description="The completions of the experiment.")

diff --git a/backend/protocol/api/_services/conversions.py b/backend/protocol/api/_services/conversions.py
index f4a7499c..f39eb6fc 100644
--- a/backend/protocol/api/_services/conversions.py
+++ b/backend/protocol/api/_services/conversions.py
@@ -919,6 +919,7 @@ def experiment_completion_from_domain(completion: ExperimentOutput) -> Experimen
         output=output_from_domain(completion.output) if completion.output else Output(),
         cost_usd=completion.cost_usd or 0.0,
         duration_seconds=completion.duration_seconds or 0.0,
+        reasoning_token_count=completion.reasoning_token_count,
     )


diff --git a/backend/protocol/api/_services/experiment_service.py b/backend/protocol/api/_services/experiment_service.py
index 15b21146..86943df4 100644
--- a/backend/protocol/api/_services/experiment_service.py
+++ b/backend/protocol/api/_services/experiment_service.py
@@ -84,6 +84,25 @@ async def get_experiment(
             input_ids=input_ids,
         )

+        # Fetch reasoning tokens for outputs if they exist
+        if exp.outputs:
+            completion_ids = [output.completion_id for output in exp.outputs]
+            # Get completions with traces from ClickHouse (exclude large fields we don't need)
+            completions_with_traces = await self.completion_storage.completions_by_ids(
+                completion_ids, exclude={"agent_id", "input_variables", "input_messages", "output_messages", "messages"},
+            )
+            # Create a map of completion_id -> reasoning_token_count
+            reasoning_tokens_map = {}
+            for comp in completions_with_traces:
+                reasoning_tokens = self._calculate_reasoning_tokens_from_traces(comp.traces)
+                if reasoning_tokens is not None:
+                    reasoning_tokens_map[comp.id] = reasoning_tokens
+
+            # Attach reasoning token counts to experiment outputs
+            for output in exp.outputs:
+                if output.completion_id in reasoning_tokens_map:
+                    output.reasoning_token_count = reasoning_tokens_map[output.completion_id]
+
         annotations: list[Annotation] = []
         if include is None or "annotations" in include:
             annotations = await self.annotation_storage.list(
@@ -97,6 +116,39 @@ async def get_experiment(
         # getting annotations as needed
         return experiment_from_domain(exp, annotations)

+    def _calculate_reasoning_tokens_from_traces(self, traces: list[Any] | None) -> float | None:
+        """Sum the reasoning tokens reported by the LLM traces of a completion.
+
+        Only traces whose ``kind`` is "llm" are inspected, and only their
+        ``usage.completion.reasoning_token_count`` value is read. A missing
+        attribute at any level is treated like a missing count.
+
+        Args:
+            traces: Trace objects of a completion; may be None or empty.
+
+        Returns:
+            The summed reasoning tokens as a float (0.0 if only zero counts were
+            reported), or None if no LLM trace reports a reasoning token count.
+        """
+        if not traces:
+            return None
+
+        counts: list[float] = []
+        for trace in traces:
+            # Only check LLM traces
+            if getattr(trace, "kind", None) != "llm":
+                continue
+
+            # getattr defaults replace the hasattr/truthiness pairs: a missing or
+            # None usage/completion simply yields a None count.
+            usage = getattr(trace, "usage", None)
+            completion_usage = getattr(usage, "completion", None)
+            reasoning = getattr(completion_usage, "reasoning_token_count", None)
+            if reasoning is not None:
+                counts.append(float(reasoning))
+
+        return sum(counts) if counts else None
+
     async def list_experiments(self, agent_id: str | None = None, limit: int = 10, offset: int = 0) -> Page[Experiment]:
         if agent_id:
             agent = await self.agent_storage.get_agent(agent_id)
diff --git a/web/eslint.config.mjs b/web/eslint.config.mjs
index 22331647..051521d9 100644
--- a/web/eslint.config.mjs
+++ b/web/eslint.config.mjs
@@ -12,6 +12,12 @@ const compat = new FlatCompat({
 const eslintConfig = [
   ...compat.extends("next/core-web-vitals", "next/typescript"),
+  {
+    // Global ignores: in flat config `ignores` applies to every config only when it is the
+    // sole key of its object; placed next to `rules` it would merely scope those rules.
+    // ".next/**/*" already covers .next/types, .next/static and .next/server.
+    ignores: [".next/**/*", "out/**/*", "build/**/*", "dist/**/*", "node_modules/**/*"],
+  },
   {
     rules: {
       "no-restricted-imports": [
         "error",
diff --git a/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx b/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
index 33b024d2..59efff9d 100644
--- a/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
+++ b/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
@@ -10,6 +10,7 @@ import {
   getSharedPartsOfPrompts,
   getValidCosts,
getValidDurations, + getValidReasoningTokens, } from "@/components/utils/utils"; import { useColumnWidths } from "@/hooks/useColumnWidths"; import { useVersionHiding } from "@/hooks/useVersionHiding"; @@ -115,6 +116,9 @@ export function MatrixSection(props: Props) { const allAvgDurations = priceAndLatencyPerVersion .map(({ metrics }) => metrics.avgDuration) .filter((duration): duration is number => duration !== undefined); + const allAvgReasoningTokens = priceAndLatencyPerVersion + .map(({ metrics }) => metrics.avgReasoningTokens) + .filter((tokens): tokens is number => tokens !== undefined); // Calculate raw metrics lookup for percentile data const rawMetricsPerVersionPerKey = getRawMetricsPerVersionPerKey(experiment, annotations); @@ -140,6 +144,10 @@ export function MatrixSection(props: Props) { if (priceAndLatency.metrics.avgDuration !== undefined) { allMetrics.unshift({ key: "duration", average: priceAndLatency.metrics.avgDuration }); } + // Only add reasoning tokens metric if it has a valid value + if (priceAndLatency.metrics.avgReasoningTokens !== undefined) { + allMetrics.unshift({ key: "reasoning", average: priceAndLatency.metrics.avgReasoningTokens }); + } } // Combine allMetricsPerKey with price and latency data @@ -151,6 +159,9 @@ export function MatrixSection(props: Props) { if (allAvgDurations.length > 0) { allMetricsPerKeyForVersion.duration = allAvgDurations; } + if (allAvgReasoningTokens.length > 0) { + allMetricsPerKeyForVersion.reasoning = allAvgReasoningTokens; + } } // Combine rawMetricsPerKey with price and latency data @@ -158,6 +169,9 @@ export function MatrixSection(props: Props) { if (priceAndLatency?.metrics) { versionMetricsPerKeyForVersion.cost = priceAndLatency.metrics.costs; versionMetricsPerKeyForVersion.duration = priceAndLatency.metrics.durations; + if (priceAndLatency.metrics.reasoningTokens.length > 0) { + versionMetricsPerKeyForVersion.reasoning = priceAndLatency.metrics.reasoningTokens; + } } // Find the original index of this 
version in the sorted versions array @@ -204,9 +218,10 @@ export function MatrixSection(props: Props) { .map((version) => findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id)) .filter(Boolean); // Remove undefined completions - // Calculate cost and duration arrays for this row using centralized utility functions + // Calculate cost, duration, and reasoning token arrays for this row using centralized utility functions const allCostsForRow = getValidCosts(completionsForInput); const allDurationsForRow = getValidDurations(completionsForInput); + const allReasoningTokensForRow = getValidReasoningTokens(completionsForInput); // Calculate metrics per key for this row (for row-based comparison coloring) const allMetricsPerKeyForRowData = getAllMetricsPerKeyForRow(experiment, annotations, input.id); @@ -216,13 +231,16 @@ export function MatrixSection(props: Props) { ...allMetricsPerKeyForRowData, }; - // Add cost and duration arrays if they have data + // Add cost, duration, and reasoning token arrays if they have data if (allCostsForRow.length > 0) { allMetricsPerKeyForRow.cost = allCostsForRow; } if (allDurationsForRow.length > 0) { allMetricsPerKeyForRow.duration = allDurationsForRow; } + if (allReasoningTokensForRow.length > 0) { + allMetricsPerKeyForRow.reasoning = allReasoningTokensForRow; + } return orderedVersions.map((version) => { const completion = findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id); diff --git a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx index 728a5397..a9e07456 100644 --- a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx +++ b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx @@ -7,7 +7,12 @@ import { PageError } from "@/components/PageError"; import { useToast } from "@/components/ToastProvider"; import { 
AnnotationsView } from "@/components/annotations/AnnotationsView"; import { MessagesViewer } from "@/components/messages/MessagesViewer"; -import { shouldIncludeCostMetric, shouldIncludeDurationMetric } from "@/components/utils/utils"; +import { + getReasoningTokenCount, + shouldIncludeCostMetric, + shouldIncludeDurationMetric, + shouldIncludeReasoningMetric, +} from "@/components/utils/utils"; import { Annotation, ExperimentCompletion } from "@/types/models"; import { getMetricsForCompletion } from "../../../utils"; @@ -48,6 +53,14 @@ function CompletionCell(props: CompletionCellProps) { metrics.push({ key: "duration", average: completion.duration_seconds }); } + // Add reasoning tokens metric if valid using centralized utility + if (shouldIncludeReasoningMetric(completion)) { + const reasoningTokens = getReasoningTokenCount(completion); + if (reasoningTokens !== undefined) { + metrics.push({ key: "reasoning", average: reasoningTokens }); + } + } + // Add custom metrics from annotations if (completionMetrics.length > 0) { metrics.push(...completionMetrics); diff --git a/web/src/components/MetricItem.tsx b/web/src/components/MetricItem.tsx index b5f6998c..b19a3f87 100644 --- a/web/src/components/MetricItem.tsx +++ b/web/src/components/MetricItem.tsx @@ -71,6 +71,9 @@ export function MetricItem({ if (metricKey.includes("duration") || metricKey.includes("latency")) { return "duration"; } + if (metricKey.includes("reasoning")) { + return "reasoning"; + } return undefined; }, [metricKey]); @@ -108,18 +111,30 @@ export function MetricItem({ return (value: number) => (usePer1kMultiplier ? formatCurrency(value, 1000) : `$${formatNumber(value)}`); } else if (metricType === "duration") { return formatDuration; + } else if (metricType === "reasoning") { + return (value: number) => `${Math.round(value).toLocaleString()} tokens`; } else { return (value: number) => value.toFixed(2); } }, [metricType, usePer1kMultiplier]); const displayLabel = showAvgPrefix - ? 
`Average ${metricKey === "cost" ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") : metricKey.replace(/_/g, " ")}` + ? `Average ${ + metricKey === "cost" + ? usePer1kMultiplier + ? "cost (per 1K)" + : "cost" + : metricKey === "reasoning" + ? "reasoning" + : metricKey.replace(/_/g, " ") + }` : metricKey === "cost" ? usePer1kMultiplier ? "cost (per 1K)" : "cost" - : metricKey.replace(/_/g, " "); + : metricKey === "reasoning" + ? "reasoning" + : metricKey.replace(/_/g, " "); if (percentiles && showAvgPrefix) { return ( diff --git a/web/src/components/completion-modal/TracesView.tsx b/web/src/components/completion-modal/TracesView.tsx index 3b7bbeba..b74e1895 100644 --- a/web/src/components/completion-modal/TracesView.tsx +++ b/web/src/components/completion-modal/TracesView.tsx @@ -7,21 +7,133 @@ type Props = { traces?: Trace[]; }; +type UsageInfoProps = { + trace: Extract; + traceIndex: number; +}; + +function UsageInfo({ trace, traceIndex }: UsageInfoProps) { + if (!trace.usage) return null; + + // Check if this is the new detailed usage structure + if ("prompt" in trace.usage && "completion" in trace.usage) { + const detailedUsage = trace.usage as { + prompt: { text_token_count?: number; cost_usd: number }; + completion: { + text_token_count?: number; + reasoning_token_count?: number; + cached_token_count?: number; + cost_usd: number; + }; + }; + + const items = []; + + // Prompt tokens + if (detailedUsage.prompt.text_token_count) { + items.push( + + ); + } + + // Completion tokens + if (detailedUsage.completion.text_token_count) { + items.push( + + ); + } + + // Reasoning tokens (new!) 
+ if (detailedUsage.completion.reasoning_token_count && detailedUsage.completion.reasoning_token_count > 0) { + items.push( + + ); + } + + // Cached tokens + if (detailedUsage.completion.cached_token_count && detailedUsage.completion.cached_token_count > 0) { + items.push( + + ); + } + + // Total cost + const totalCost = detailedUsage.prompt.cost_usd + detailedUsage.completion.cost_usd; + if (totalCost > 0) { + items.push(); + } + + return <>{items}; + } + + // Fallback to old simple structure + const simpleUsage = trace.usage as { input_tokens?: number; output_tokens?: number; total_tokens?: number }; + const items = []; + + if (simpleUsage.input_tokens) { + items.push( + + ); + } + + if (simpleUsage.output_tokens) { + items.push( + + ); + } + + if (simpleUsage.total_tokens) { + items.push( + + ); + } + + return <>{items}; +} + export function TracesView({ traces }: Props) { const llmTracesWithUsage = useMemo(() => { if (!traces || traces.length === 0) { return []; } - // Filter for LLM traces that have usage data with text_token_count + // Filter for LLM traces that have usage data return traces.filter((trace): trace is Extract => { if (trace.kind !== "llm" || !trace.usage) return false; - // Check if any usage entry has text_token_count - return Object.values(trace.usage).some( - (usageValue: unknown) => - typeof usageValue === "object" && usageValue !== null && "text_token_count" in usageValue - ); + // Check if usage has the new detailed structure or old simple structure + const hasDetailedUsage = "prompt" in trace.usage && "completion" in trace.usage; + const hasSimpleUsage = "input_tokens" in trace.usage || "output_tokens" in trace.usage; + + return hasDetailedUsage || hasSimpleUsage; }); }, [traces]); @@ -37,43 +149,7 @@ export function TracesView({ traces }: Props) {
{/* Show provider if available */} {trace.provider && } - {trace.usage && - Object.entries(trace.usage).map(([key, usageValue]) => { - // Type guard to check if usageValue has the expected properties - if (typeof usageValue !== "object" || usageValue === null || !("text_token_count" in usageValue)) { - return null; - } - - const usageData = usageValue as { - text_token_count: number; - cost_usd?: number; - }; - - const textTokenCount = usageData.text_token_count; - const costUsd = usageData.cost_usd; - - if (textTokenCount === undefined) return null; - - // Format the title based on the key - const formatTitle = (key: string) => { - const capitalizedKey = key.charAt(0).toUpperCase() + key.slice(1); - return `${capitalizedKey} Token Count`; - }; - - const formatCostTitle = (key: string) => { - const capitalizedKey = key.charAt(0).toUpperCase() + key.slice(1); - return `${capitalizedKey} Cost`; - }; - - return ( -
- - {costUsd !== undefined && ( - - )} -
- ); - })} +
))} diff --git a/web/src/components/utils/__tests__/utils.test.ts b/web/src/components/utils/__tests__/utils.test.ts index e8b0ebfd..ad6e8d71 100644 --- a/web/src/components/utils/__tests__/utils.test.ts +++ b/web/src/components/utils/__tests__/utils.test.ts @@ -1,4 +1,4 @@ -import { Version } from "@/types/models"; +import { Completion, ExperimentCompletion, Version } from "@/types/models"; import { calculateAverageMetrics, filterAnnotations, @@ -9,6 +9,7 @@ import { getDifferingVersionKeys, getMetricBadgeColor, getMetricBadgeWithRelative, + getReasoningTokenCount, getSharedPartsOfPrompts, getValidCosts, getValidDurations, @@ -19,6 +20,7 @@ import { resolveRef, shouldIncludeCostMetric, shouldIncludeDurationMetric, + shouldIncludeReasoningMetric, sortVersionKeys, stripMarkdown, transformCompletionsData, @@ -306,8 +308,10 @@ describe("Calculation Functions", () => { expect(result).toEqual({ avgCost: undefined, avgDuration: undefined, + avgReasoningTokens: undefined, costs: [], durations: [], + reasoningTokens: [], }); }); @@ -318,8 +322,10 @@ describe("Calculation Functions", () => { expect(result).toEqual({ avgCost: 2, avgDuration: 3, + avgReasoningTokens: undefined, costs: [1, 3], durations: [2, 4], + reasoningTokens: [], }); }); @@ -333,6 +339,94 @@ describe("Calculation Functions", () => { expect(result.avgCost).toBe(1); // (0 + 2) / 2 expect(result.avgDuration).toBe(2); // (0 + 4) / 2 }); + + it("calculates reasoning tokens correctly", () => { + const completions = [ + { + ...mockExperimentCompletion(1, 2), + reasoning_token_count: 100, + }, + { + ...mockExperimentCompletion(2, 3), + reasoning_token_count: 200, + }, + ]; + + const result = calculateAverageMetrics(completions); + expect(result.avgReasoningTokens).toBe(150); // (100 + 200) / 2 + expect(result.reasoningTokens).toEqual([100, 200]); + }); + }); + + describe("getReasoningTokenCount", () => { + it("returns reasoning tokens from ExperimentCompletion field", () => { + const completion: Partial = { + 
reasoning_token_count: 150, + }; + expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(150); + }); + + it("returns 0 for explicitly 0 reasoning tokens", () => { + const completion: Partial = { + reasoning_token_count: 0, + }; + expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(0); + }); + + it("returns undefined when reasoning tokens field not present", () => { + const completion: Partial = {}; + expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBeUndefined(); + }); + + it("falls back to traces for regular Completion type", () => { + const completion: Partial = { + traces: [ + { + kind: "llm", + duration_seconds: 1, + cost_usd: 0, + model: "test-model", + provider: "test-provider", + usage: { + prompt: { text_token_count: 50, cost_usd: 0 }, + completion: { text_token_count: 100, reasoning_token_count: 100, cost_usd: 0 }, + }, + }, + ], + }; + expect(getReasoningTokenCount(completion as Completion)).toBe(100); + }); + }); + + describe("shouldIncludeReasoningMetric", () => { + it("returns true for valid completion with reasoning tokens", () => { + const completion: Partial = { + reasoning_token_count: 150, + output: { messages: [] }, + }; + expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(true); + }); + + it("returns false for undefined completion", () => { + expect(shouldIncludeReasoningMetric(undefined)).toBe(false); + }); + + it("returns false when output has error", () => { + const completion: Partial = { + reasoning_token_count: 150, + output: { + error: { error: "Something went wrong" }, + }, + }; + expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(false); + }); + + it("returns false when reasoning tokens are undefined", () => { + const completion: Partial = { + output: { messages: [] }, + }; + expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(false); + }); }); }); diff --git a/web/src/components/utils/utils.ts 
b/web/src/components/utils/utils.ts index a902c6c9..4bc00a40 100644 --- a/web/src/components/utils/utils.ts +++ b/web/src/components/utils/utils.ts @@ -1,11 +1,13 @@ // Utility functions for experiment components import { Annotation, + Completion, ExperimentCompletion, ExperimentWithLookups, ExtendedVersion, Message, OutputSchema, + Trace, Version, } from "@/types/models"; import { findCommonSubstrings } from "./stringMatchingUtils"; @@ -32,7 +34,7 @@ export function getMetricBadgeWithRelative( value: number, values: number[], isHigherBetter: boolean = false, - metricType?: "cost" | "duration" + metricType?: "cost" | "duration" | "reasoning" ) { if (!values || values.length === 0) { return { @@ -70,6 +72,9 @@ export function getMetricBadgeWithRelative( if (metricType === "duration") { return isBetterValue ? "faster" : "slower"; } + if (metricType === "reasoning") { + return isBetterValue ? "more efficient" : "less efficient"; + } return ""; // Don't show any descriptor for unknown metric types }; @@ -79,14 +84,14 @@ export function getMetricBadgeWithRelative( if (isBest) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? "bg-green-200 border border-green-400 text-green-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(true); relativeText = comparisonText ? `${(max / min).toFixed(1)}x ${comparisonText}` : `${(max / min).toFixed(1)}x`; } else if (isWorst) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? 
"bg-red-200 border border-red-300 text-red-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(false); @@ -97,7 +102,7 @@ export function getMetricBadgeWithRelative( // For non-best values, show how much worse they are if (!isBest && max > 0) { - if (metricType === "cost" || metricType === "duration") { + if (metricType === "cost" || metricType === "duration" || metricType === "reasoning") { relativeText = `${(max / value).toFixed(1)}x ${getComparisonText(false)}`; } else { relativeText = `${(max / value).toFixed(1)}x`; @@ -109,14 +114,14 @@ export function getMetricBadgeWithRelative( if (isBest) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? "bg-green-200 border border-green-400 text-green-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(true); relativeText = comparisonText ? `${(max / min).toFixed(1)}x ${comparisonText}` : `${(max / min).toFixed(1)}x`; } else if (isWorst) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? 
"bg-red-200 border border-red-300 text-red-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(false); @@ -127,7 +132,7 @@ export function getMetricBadgeWithRelative( // For non-best values, show how much worse they are if (!isBest && min > 0) { - if (metricType === "cost" || metricType === "duration") { + if (metricType === "cost" || metricType === "duration" || metricType === "reasoning") { relativeText = `${(value / min).toFixed(1)}x ${getComparisonText(false)}`; } else { relativeText = `${(value / min).toFixed(1)}x`; @@ -256,6 +261,12 @@ export function shouldIncludeDurationMetric( ); } +export function shouldIncludeReasoningMetric( + completion: ExperimentCompletion | undefined +): completion is ExperimentCompletion { + return completion != null && getReasoningTokenCount(completion) !== undefined && !completion.output?.error; +} + export function getValidCosts(completions: (ExperimentCompletion | undefined)[]): number[] { return completions .filter((completion): completion is ExperimentCompletion => shouldIncludeCostMetric(completion)) @@ -268,26 +279,47 @@ export function getValidDurations(completions: (ExperimentCompletion | undefined .map((completion) => completion.duration_seconds); } +export function getValidReasoningTokens(completions: (ExperimentCompletion | undefined)[]): number[] { + return completions + .filter((completion): completion is ExperimentCompletion => shouldIncludeReasoningMetric(completion)) + .map((completion) => getReasoningTokenCount(completion)) + .filter((tokens): tokens is number => tokens !== undefined); +} + export function calculateAverageMetrics(completions: ExperimentCompletion[]): { avgCost: number | undefined; avgDuration: number | undefined; + avgReasoningTokens: number | undefined; costs: number[]; durations: number[]; + reasoningTokens: number[]; } { - if (completions.length === 0) return { avgCost: undefined, avgDuration: undefined, costs: [], durations: [] }; + if 
(completions.length === 0)
+    return {
+      avgCost: undefined,
+      avgDuration: undefined,
+      avgReasoningTokens: undefined,
+      costs: [],
+      durations: [],
+      reasoningTokens: [],
+    };

   // Use centralized filtering logic
   const costs = getValidCosts(completions);
   const durations = getValidDurations(completions);
+  const reasoningTokens = getValidReasoningTokens(completions);

   const totalCost = costs.reduce((sum, cost) => sum + cost, 0);
   const totalDuration = durations.reduce((sum, duration) => sum + duration, 0);
+  const totalReasoningTokens = reasoningTokens.reduce((sum, tokens) => sum + tokens, 0);

   return {
     avgCost: costs.length > 0 ? totalCost / costs.length : undefined,
     avgDuration: durations.length > 0 ? totalDuration / durations.length : undefined,
+    avgReasoningTokens: reasoningTokens.length > 0 ? totalReasoningTokens / reasoningTokens.length : undefined,
     costs,
     durations,
+    reasoningTokens,
   };
 }

@@ -325,7 +357,14 @@ export function getPriceAndLatencyPerVersion(
   }>
 ): Array<{
   versionId: string;
-  metrics: { avgCost: number | undefined; avgDuration: number | undefined; costs: number[]; durations: number[] };
+  metrics: {
+    avgCost: number | undefined;
+    avgDuration: number | undefined;
+    avgReasoningTokens: number | undefined;
+    costs: number[];
+    durations: number[];
+    reasoningTokens: number[];
+  };
 }> {
   return completionsPerVersion.map(({ versionId, completions }) => ({
     versionId,
@@ -1134,3 +1173,133 @@ export function stripMarkdown(markdown: string): string {
     .replace(/\n+/g, " ") // Replace newlines with spaces
     .trim();
 }
+
+/**
+ * Returns the total number of reasoning tokens used by a completion.
+ *
+ * An ExperimentCompletion exposes the count directly as `reasoning_token_count`;
+ * for a regular Completion the count is summed over the usage of its LLM traces.
+ *
+ * @param completion - The completion to inspect
+ * @returns The total reasoning tokens (possibly 0), or undefined when no count is available
+ */
+export function getReasoningTokenCount(completion: Completion | ExperimentCompletion): number | undefined {
+  // For ExperimentCompletion, use the direct reasoning_token_count field.
+  // The backend field is nullable and may arrive as JSON null; normalize it to undefined so
+  // callers that test `!== undefined` do not treat a missing count as a valid 0-token metric.
+  if ("reasoning_token_count" in completion) {
+    return completion.reasoning_token_count ?? undefined;
+  }
+
+  // For regular Completion, fall back to parsing traces
+  const traces = "traces" in completion ? completion.traces : undefined;
+
+  if (!traces || !Array.isArray(traces)) {
+    return undefined;
+  }
+
+  let totalReasoningTokens = 0;
+  let hasReasoningField = false;
+
+  for (const trace of traces) {
+    // Only check LLM traces
+    if (trace.kind !== "llm") continue;
+
+    const llmTrace = trace as Extract<Trace, { kind: "llm" }>;
+
+    // Check if trace has usage data
+    if (!llmTrace.usage) continue;
+
+    // Only the detailed usage structure (with a `completion` entry) can report reasoning tokens
+    if ("completion" in llmTrace.usage && llmTrace.usage.completion) {
+      const reasoningTokens = llmTrace.usage.completion.reasoning_token_count;
+
+      // Count the field whenever it holds a number (even 0); skip null/undefined
+      if (typeof reasoningTokens === "number") {
+        hasReasoningField = true;
+        totalReasoningTokens += reasoningTokens;
+      }
+    }
+  }
+
+  // Return undefined if no traces had reasoning_token_count field
+  // Return the total (including 0) if the field was present
+  return hasReasoningField ? totalReasoningTokens : undefined;
+}
+
+/**
+ * Checks if a completion used reasoning (has reasoning tokens > 0)
+ *
+ * @param completion - The completion object to check
+ * @returns True if the completion used reasoning, false if no reasoning or reasoning tokens not present
+ */
+export function hasReasoningTokens(completion: Completion | ExperimentCompletion): boolean {
+  const reasoningTokens = getReasoningTokenCount(completion);
+  return reasoningTokens !== undefined && reasoningTokens > 0;
+}
+
+/**
+ * Gets a summary of token usage from completion traces including reasoning tokens
+ *
+ * @param completion - The completion object containing traces
+ * @returns Object with token usage breakdown, reasoningTokens is undefined if not present in trace
+ */
+export function getTokenUsageSummary(completion: Completion | ExperimentCompletion): {
+  promptTokens: number;
+  completionTokens: number;
+  reasoningTokens: number | undefined;
+  cachedTokens: number;
+  totalTokens: number;
+} {
+  const traces = "traces" in completion ? completion.traces : undefined;
+
+  let promptTokens = 0;
+  let completionTokens = 0;
+  let reasoningTokens: number | undefined = undefined;
+  let cachedTokens = 0;
+
+  if (traces && Array.isArray(traces)) {
+    for (const trace of traces) {
+      if (trace.kind !== "llm") continue;
+
+      const llmTrace = trace as Extract<Trace, { kind: "llm" }>;
+
+      if (!llmTrace.usage) continue;
+
+      // Handle detailed usage structure
+      if ("prompt" in llmTrace.usage && "completion" in llmTrace.usage) {
+        const usage = llmTrace.usage as {
+          prompt: { text_token_count?: number };
+          completion: { text_token_count?: number; reasoning_token_count?: number; cached_token_count?: number };
+        };
+
+        if (usage.prompt.text_token_count) {
+          promptTokens += usage.prompt.text_token_count;
+        }
+
+        if (usage.completion.text_token_count) {
+          completionTokens += usage.completion.text_token_count;
+        }
+
+        // reasoningTokens stays undefined until a trace actually reports a numeric count
+        const traceReasoningTokens = usage.completion.reasoning_token_count;
+        if (typeof traceReasoningTokens === "number") {
+          reasoningTokens = (reasoningTokens ?? 0) + traceReasoningTokens;
+        }
+
+        if (usage.completion.cached_token_count) {
+          cachedTokens += usage.completion.cached_token_count;
+        }
+      }
+    }
+  }
+
+  return {
+    promptTokens,
+    completionTokens,
+    reasoningTokens,
+    cachedTokens,
+    // NOTE(review): assumes completion text_token_count excludes reasoning tokens; if the backend
+    // already includes them there, this total double-counts — confirm against the usage model.
+    totalTokens: promptTokens + completionTokens + (reasoningTokens ?? 0),
+  };
+}
diff --git a/web/src/types/models.ts b/web/src/types/models.ts
index cbd6698c..c1bfcdb8 100644
--- a/web/src/types/models.ts
+++ b/web/src/types/models.ts
@@ -145,10 +145,23 @@ export interface Output {
   error?: Error;
 }

+export interface TokenUsage {
+  text_token_count?: number;
+  audio_token_count?: number;
+  audio_count?: number;
+  image_token_count?: number;
+  image_count?: number;
+  cost_usd: number;
+}
+
+export interface CompletionUsage extends TokenUsage {
+  cached_token_count?: number;
+  reasoning_token_count?: number;
+}
+
 export interface InferenceUsage {
-  input_tokens: number;
-  output_tokens: number;
-  total_tokens: number;
+  prompt: TokenUsage;
+  completion: CompletionUsage;
 }

 export interface LLMTrace {
@@ -199,6 +212,8 @@ export interface ExperimentCompletion {
   output: Output;
   cost_usd: number;
   duration_seconds: number;
+  // May be null when the backend serializes a missing count — TODO confirm against the API response
+  reasoning_token_count?: number | null;
 }

 export interface Completion {