From d36b129a9df22e49f6a4f3e7a8bff364a37b6680 Mon Sep 17 00:00:00 2001 From: Jacek Zimonski <39839016+jacekzimonski@users.noreply.github.com> Date: Mon, 13 Oct 2025 15:57:31 +0200 Subject: [PATCH 1/4] GitHub-462 Reasoning Token Count Support --- backend/core/domain/experiment.py | 1 + backend/protocol/api/_api_models.py | 4 + backend/protocol/api/_services/conversions.py | 1 + .../api/_services/experiment_service.py | 54 ++++++ .../[id]/sections/Results/MatrixSection.tsx | 22 ++- .../Results/completion/CompletionCell.tsx | 10 +- web/src/components/MetricItem.tsx | 15 +- .../completion-modal/TracesView.tsx | 164 +++++++++++----- .../components/utils/__tests__/utils.test.ts | 89 +++++++++ web/src/components/utils/utils.ts | 181 +++++++++++++++++- web/src/types/models.ts | 20 +- 11 files changed, 502 insertions(+), 59 deletions(-) diff --git a/backend/core/domain/experiment.py b/backend/core/domain/experiment.py index ebb527e8..72c8bf8c 100644 --- a/backend/core/domain/experiment.py +++ b/backend/core/domain/experiment.py @@ -24,6 +24,7 @@ class ExperimentOutput(BaseModel): output: AgentOutput | None cost_usd: float | None duration_seconds: float | None + reasoning_token_count: float | None = None class ExperimentVersion(Version): diff --git a/backend/protocol/api/_api_models.py b/backend/protocol/api/_api_models.py index 927b7eeb..0ce25f71 100644 --- a/backend/protocol/api/_api_models.py +++ b/backend/protocol/api/_api_models.py @@ -496,6 +496,10 @@ class Completion(BaseModel): output: Output cost_usd: float duration_seconds: float + reasoning_token_count: float | None = Field( + default=None, + description="The number of reasoning tokens used in the inference, if applicable.", + ) completions: list[Completion] | None = Field(default=None, description="The completions of the experiment.") diff --git a/backend/protocol/api/_services/conversions.py b/backend/protocol/api/_services/conversions.py index f4a7499c..f39eb6fc 100644 --- a/backend/protocol/api/_services/conversions.py +++ b/backend/protocol/api/_services/conversions.py @@ -919,6 +919,7 @@ def experiment_completion_from_domain(completion: ExperimentOutput) -> Experimen output=output_from_domain(completion.output) if completion.output else Output(), cost_usd=completion.cost_usd or 0.0, duration_seconds=completion.duration_seconds or 0.0, + reasoning_token_count=completion.reasoning_token_count, ) diff --git a/backend/protocol/api/_services/experiment_service.py b/backend/protocol/api/_services/experiment_service.py index 15b21146..b38a1bd8 100644 --- a/backend/protocol/api/_services/experiment_service.py +++ b/backend/protocol/api/_services/experiment_service.py @@ -84,6 +84,25 @@ async def get_experiment( input_ids=input_ids, ) + # Fetch reasoning tokens for outputs if they exist + if exp.outputs: + completion_ids = [output.completion_id for output in exp.outputs] + # Get completions with traces from ClickHouse + completions_with_traces = await self.completion_storage.completions_by_ids( + completion_ids, exclude={"input_variables", "input_messages", "output_messages", "messages"}, + ) + # Create a map of completion_id -> reasoning_token_count + reasoning_tokens_map = {} + for comp in completions_with_traces: + reasoning_tokens = self._calculate_reasoning_tokens_from_traces(comp.traces) + if reasoning_tokens is not None: + reasoning_tokens_map[comp.id] = reasoning_tokens + + # Attach reasoning token counts to experiment outputs + for output in exp.outputs: + if output.completion_id in reasoning_tokens_map: + output.reasoning_token_count = reasoning_tokens_map[output.completion_id] + annotations: list[Annotation] = [] if include is None or "annotations" in include: annotations = await self.annotation_storage.list( @@ -97,6 +116,41 @@ async def get_experiment( # getting annotations as needed return experiment_from_domain(exp, annotations) + def _calculate_reasoning_tokens_from_traces(self, traces: list | None) -> float | None: + """Calculate total reasoning tokens from traces. + + Args: + traces: List of trace objects + + Returns: + Total reasoning tokens as float, or None if no reasoning tokens found + """ + if not traces: + return None + + total_reasoning_tokens = 0.0 + has_reasoning_field = False + + for trace in traces: + # Only check LLM traces + if not hasattr(trace, "kind") or trace.kind != "llm": + continue + + # Check if trace has usage data + if not hasattr(trace, "usage") or not trace.usage: + continue + + # Handle detailed usage structure + if hasattr(trace.usage, "completion") and trace.usage.completion: + completion_usage = trace.usage.completion + + # Check if reasoning_token_count field exists + if hasattr(completion_usage, "reasoning_token_count") and completion_usage.reasoning_token_count is not None: + has_reasoning_field = True + total_reasoning_tokens += float(completion_usage.reasoning_token_count or 0) + + return total_reasoning_tokens if has_reasoning_field else None + async def list_experiments(self, agent_id: str | None = None, limit: int = 10, offset: int = 0) -> Page[Experiment]: if agent_id: agent = await self.agent_storage.get_agent(agent_id) diff --git a/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx b/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx index 33b024d2..59efff9d 100644 --- a/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx +++ b/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx @@ -10,6 +10,7 @@ import { getSharedPartsOfPrompts, getValidCosts, getValidDurations, + getValidReasoningTokens, } from "@/components/utils/utils"; import { useColumnWidths } from "@/hooks/useColumnWidths"; import { useVersionHiding } from "@/hooks/useVersionHiding"; @@ -115,6 +116,9 @@ export function MatrixSection(props: Props) { const allAvgDurations = priceAndLatencyPerVersion .map(({ metrics }) => metrics.avgDuration) .filter((duration): duration is number => duration !== undefined); + const allAvgReasoningTokens = priceAndLatencyPerVersion + .map(({ metrics }) => metrics.avgReasoningTokens) + .filter((tokens): tokens is number => tokens !== undefined); // Calculate raw metrics lookup for percentile data const rawMetricsPerVersionPerKey = getRawMetricsPerVersionPerKey(experiment, annotations); @@ -140,6 +144,10 @@ export function MatrixSection(props: Props) { if (priceAndLatency.metrics.avgDuration !== undefined) { allMetrics.unshift({ key: "duration", average: priceAndLatency.metrics.avgDuration }); } + // Only add reasoning tokens metric if it has a valid value + if (priceAndLatency.metrics.avgReasoningTokens !== undefined) { + allMetrics.unshift({ key: "reasoning", average: priceAndLatency.metrics.avgReasoningTokens }); + } } // Combine allMetricsPerKey with price and latency data @@ -151,6 +159,9 @@ export function MatrixSection(props: Props) { if (allAvgDurations.length > 0) { allMetricsPerKeyForVersion.duration = allAvgDurations; } + if (allAvgReasoningTokens.length > 0) { + allMetricsPerKeyForVersion.reasoning = allAvgReasoningTokens; + } } // Combine rawMetricsPerKey with price and latency data @@ -158,6 +169,9 @@ export function MatrixSection(props: Props) { if (priceAndLatency?.metrics) { versionMetricsPerKeyForVersion.cost = priceAndLatency.metrics.costs; versionMetricsPerKeyForVersion.duration = priceAndLatency.metrics.durations; + if (priceAndLatency.metrics.reasoningTokens.length > 0) { + versionMetricsPerKeyForVersion.reasoning = priceAndLatency.metrics.reasoningTokens; + } } // Find the original index of this version in the sorted versions array @@ -204,9 +218,10 @@ export function MatrixSection(props: Props) { .map((version) => findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id)) .filter(Boolean); // Remove undefined completions - // Calculate cost and duration arrays for this row using centralized utility functions + // Calculate cost, duration, and reasoning token arrays for this row using centralized utility functions const allCostsForRow = getValidCosts(completionsForInput); const allDurationsForRow = getValidDurations(completionsForInput); + const allReasoningTokensForRow = getValidReasoningTokens(completionsForInput); // Calculate metrics per key for this row (for row-based comparison coloring) const allMetricsPerKeyForRowData = getAllMetricsPerKeyForRow(experiment, annotations, input.id); @@ -216,13 +231,16 @@ export function MatrixSection(props: Props) { ...allMetricsPerKeyForRowData, }; - // Add cost and duration arrays if they have data + // Add cost, duration, and reasoning token arrays if they have data if (allCostsForRow.length > 0) { allMetricsPerKeyForRow.cost = allCostsForRow; } if (allDurationsForRow.length > 0) { allMetricsPerKeyForRow.duration = allDurationsForRow; } + if (allReasoningTokensForRow.length > 0) { + allMetricsPerKeyForRow.reasoning = allReasoningTokensForRow; + } return orderedVersions.map((version) => { const completion = findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id); diff --git a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx index 728a5397..5336b480 100644 --- a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx +++ b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx @@ -7,7 +7,7 @@ import { PageError } from "@/components/PageError"; import { useToast } from "@/components/ToastProvider"; import { AnnotationsView } from "@/components/annotations/AnnotationsView"; import { MessagesViewer } from "@/components/messages/MessagesViewer"; -import { shouldIncludeCostMetric, shouldIncludeDurationMetric } from "@/components/utils/utils"; +import { shouldIncludeCostMetric, shouldIncludeDurationMetric, shouldIncludeReasoningMetric, getReasoningTokenCount } from "@/components/utils/utils"; import { Annotation, ExperimentCompletion } from "@/types/models"; import { getMetricsForCompletion } from "../../../utils"; @@ -48,6 +48,14 @@ function CompletionCell(props: CompletionCellProps) { metrics.push({ key: "duration", average: completion.duration_seconds }); } + // Add reasoning tokens metric if valid using centralized utility + if (shouldIncludeReasoningMetric(completion)) { + const reasoningTokens = getReasoningTokenCount(completion); + if (reasoningTokens !== undefined) { + metrics.push({ key: "reasoning", average: reasoningTokens }); + } + } + // Add custom metrics from annotations if (completionMetrics.length > 0) { metrics.push(...completionMetrics); diff --git a/web/src/components/MetricItem.tsx b/web/src/components/MetricItem.tsx index b5f6998c..61e785dc 100644 --- a/web/src/components/MetricItem.tsx +++ b/web/src/components/MetricItem.tsx @@ -71,6 +71,9 @@ export function MetricItem({ if (metricKey.includes("duration") || metricKey.includes("latency")) { return "duration"; } + if (metricKey.includes("reasoning")) { + return "reasoning"; + } return undefined; }, [metricKey]); @@ -108,17 +111,27 @@ export function MetricItem({ return (value: number) => (usePer1kMultiplier ? formatCurrency(value, 1000) : `$${formatNumber(value)}`); } else if (metricType === "duration") { return formatDuration; + } else if (metricType === "reasoning") { + return (value: number) => `${Math.round(value).toLocaleString()} tokens`; } else { return (value: number) => value.toFixed(2); } }, [metricType, usePer1kMultiplier]); const displayLabel = showAvgPrefix - ? `Average ${metricKey === "cost" ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") : metricKey.replace(/_/g, " ")}` + ? `Average ${ + metricKey === "cost" + ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") + : metricKey === "reasoning" + ? "reasoning" + : metricKey.replace(/_/g, " ") + }` : metricKey === "cost" ? usePer1kMultiplier ? "cost (per 1K)" : "cost" + : metricKey === "reasoning" + ? "reasoning" : metricKey.replace(/_/g, " "); if (percentiles && showAvgPrefix) { diff --git a/web/src/components/completion-modal/TracesView.tsx b/web/src/components/completion-modal/TracesView.tsx index 3b7bbeba..2ba388f5 100644 --- a/web/src/components/completion-modal/TracesView.tsx +++ b/web/src/components/completion-modal/TracesView.tsx @@ -7,21 +7,134 @@ type Props = { traces?: Trace[]; }; +type UsageInfoProps = { + trace: Extract; + traceIndex: number; +}; + +function UsageInfo({ trace, traceIndex }: UsageInfoProps) { + if (!trace.usage) return null; + + // Check if this is the new detailed usage structure + if ("prompt" in trace.usage && "completion" in trace.usage) { + const detailedUsage = trace.usage as { + prompt: { text_token_count?: number; cost_usd: number }; + completion: { text_token_count?: number; reasoning_token_count?: number; cached_token_count?: number; cost_usd: number }; + }; + + const items = []; + + // Prompt tokens + if (detailedUsage.prompt.text_token_count) { + items.push( + + ); + } + + // Completion tokens + if (detailedUsage.completion.text_token_count) { + items.push( + + ); + } + + // Reasoning tokens (new!) + if (detailedUsage.completion.reasoning_token_count && detailedUsage.completion.reasoning_token_count > 0) { + items.push( + + ); + } + + // Cached tokens + if (detailedUsage.completion.cached_token_count && detailedUsage.completion.cached_token_count > 0) { + items.push( + + ); + } + + // Total cost + const totalCost = detailedUsage.prompt.cost_usd + detailedUsage.completion.cost_usd; + if (totalCost > 0) { + items.push( + + ); + } + + return <>{items}; + } + + // Fallback to old simple structure + const simpleUsage = trace.usage as { input_tokens?: number; output_tokens?: number; total_tokens?: number }; + const items = []; + + if (simpleUsage.input_tokens) { + items.push( + + ); + } + + if (simpleUsage.output_tokens) { + items.push( + + ); + } + + if (simpleUsage.total_tokens) { + items.push( + + ); + } + + return <>{items}; +} + export function TracesView({ traces }: Props) { const llmTracesWithUsage = useMemo(() => { if (!traces || traces.length === 0) { return []; } - // Filter for LLM traces that have usage data with text_token_count + // Filter for LLM traces that have usage data return traces.filter((trace): trace is Extract => { if (trace.kind !== "llm" || !trace.usage) return false; - // Check if any usage entry has text_token_count - return Object.values(trace.usage).some( - (usageValue: unknown) => - typeof usageValue === "object" && usageValue !== null && "text_token_count" in usageValue - ); + // Check if usage has the new detailed structure or old simple structure + const hasDetailedUsage = "prompt" in trace.usage && "completion" in trace.usage; + const hasSimpleUsage = "input_tokens" in trace.usage || "output_tokens" in trace.usage; + + return hasDetailedUsage || hasSimpleUsage; }); }, [traces]); @@ -29,6 +142,7 @@ export function TracesView({ traces }: Props) { return null; } + return (
@@ -37,43 +151,7 @@ export function TracesView({ traces }: Props) {
{/* Show provider if available */} {trace.provider && } - {trace.usage && - Object.entries(trace.usage).map(([key, usageValue]) => { - // Type guard to check if usageValue has the expected properties - if (typeof usageValue !== "object" || usageValue === null || !("text_token_count" in usageValue)) { - return null; - } - - const usageData = usageValue as { - text_token_count: number; - cost_usd?: number; - }; - - const textTokenCount = usageData.text_token_count; - const costUsd = usageData.cost_usd; - - if (textTokenCount === undefined) return null; - - // Format the title based on the key - const formatTitle = (key: string) => { - const capitalizedKey = key.charAt(0).toUpperCase() + key.slice(1); - return `${capitalizedKey} Token Count`; - }; - - const formatCostTitle = (key: string) => { - const capitalizedKey = key.charAt(0).toUpperCase() + key.slice(1); - return `${capitalizedKey} Cost`; - }; - - return ( -
- - {costUsd !== undefined && ( - - )} -
- ); - })} +
))}
diff --git a/web/src/components/utils/__tests__/utils.test.ts b/web/src/components/utils/__tests__/utils.test.ts index e8b0ebfd..70e5d291 100644 --- a/web/src/components/utils/__tests__/utils.test.ts +++ b/web/src/components/utils/__tests__/utils.test.ts @@ -9,6 +9,7 @@ import { getDifferingVersionKeys, getMetricBadgeColor, getMetricBadgeWithRelative, + getReasoningTokenCount, getSharedPartsOfPrompts, getValidCosts, getValidDurations, @@ -19,6 +20,7 @@ import { resolveRef, shouldIncludeCostMetric, shouldIncludeDurationMetric, + shouldIncludeReasoningMetric, sortVersionKeys, stripMarkdown, transformCompletionsData, @@ -306,8 +308,10 @@ describe("Calculation Functions", () => { expect(result).toEqual({ avgCost: undefined, avgDuration: undefined, + avgReasoningTokens: undefined, costs: [], durations: [], + reasoningTokens: [], }); }); @@ -318,8 +322,10 @@ describe("Calculation Functions", () => { expect(result).toEqual({ avgCost: 2, avgDuration: 3, + avgReasoningTokens: undefined, costs: [1, 3], durations: [2, 4], + reasoningTokens: [], }); }); @@ -333,6 +339,89 @@ describe("Calculation Functions", () => { expect(result.avgCost).toBe(1); // (0 + 2) / 2 expect(result.avgDuration).toBe(2); // (0 + 4) / 2 }); + + it("calculates reasoning tokens correctly", () => { + const completions = [ + { + ...mockExperimentCompletion(1, 2), + reasoning_token_count: 100 + }, + { + ...mockExperimentCompletion(2, 3), + reasoning_token_count: 200 + } + ]; + + const result = calculateAverageMetrics(completions); + expect(result.avgReasoningTokens).toBe(150); // (100 + 200) / 2 + expect(result.reasoningTokens).toEqual([100, 200]); + }); + }); + + describe("getReasoningTokenCount", () => { + it("returns reasoning tokens from ExperimentCompletion field", () => { + const completion = { + reasoning_token_count: 150 + }; + expect(getReasoningTokenCount(completion as any)).toBe(150); + }); + + it("returns 0 for explicitly 0 reasoning tokens", () => { + const completion = { + reasoning_token_count: 0 + }; + expect(getReasoningTokenCount(completion as any)).toBe(0); + }); + + it("returns undefined when reasoning tokens field not present", () => { + const completion = {}; + expect(getReasoningTokenCount(completion as any)).toBeUndefined(); + }); + + it("falls back to traces for regular Completion type", () => { + const completion = { + traces: [ + { + kind: "llm", + usage: { + completion: { reasoning_token_count: 100 }, + }, + }, + ], + }; + expect(getReasoningTokenCount(completion as any)).toBe(100); + }); + }); + + describe("shouldIncludeReasoningMetric", () => { + it("returns true for valid completion with reasoning tokens", () => { + const completion = { + reasoning_token_count: 150, + output: { messages: [] }, + }; + expect(shouldIncludeReasoningMetric(completion as any)).toBe(true); + }); + + it("returns false for undefined completion", () => { + expect(shouldIncludeReasoningMetric(undefined)).toBe(false); + }); + + it("returns false when output has error", () => { + const completion = { + reasoning_token_count: 150, + output: { + error: { error: "Something went wrong" }, + }, + }; + expect(shouldIncludeReasoningMetric(completion as any)).toBe(false); + }); + + it("returns false when reasoning tokens are undefined", () => { + const completion = { + output: { messages: [] }, + }; + expect(shouldIncludeReasoningMetric(completion as any)).toBe(false); + }); }); }); diff --git a/web/src/components/utils/utils.ts b/web/src/components/utils/utils.ts index a902c6c9..11ce4cd4 100644 --- a/web/src/components/utils/utils.ts +++ b/web/src/components/utils/utils.ts @@ -1,11 +1,13 @@ // Utility functions for experiment components import { Annotation, + Completion, ExperimentCompletion, ExperimentWithLookups, ExtendedVersion, Message, OutputSchema, + Trace, Version, } from "@/types/models"; import { findCommonSubstrings } from "./stringMatchingUtils"; @@ -32,7 +34,7 @@ export function getMetricBadgeWithRelative( value: number, values: number[], isHigherBetter: boolean = false, - metricType?: "cost" | "duration" + metricType?: "cost" | "duration" | "reasoning" ) { if (!values || values.length === 0) { return { @@ -70,6 +72,9 @@ export function getMetricBadgeWithRelative( if (metricType === "duration") { return isBetterValue ? "faster" : "slower"; } + if (metricType === "reasoning") { + return isBetterValue ? "more efficient" : "less efficient"; + } return ""; // Don't show any descriptor for unknown metric types }; @@ -79,14 +84,14 @@ export function getMetricBadgeWithRelative( if (isBest) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? "bg-green-200 border border-green-400 text-green-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(true); relativeText = comparisonText ? `${(max / min).toFixed(1)}x ${comparisonText}` : `${(max / min).toFixed(1)}x`; } else if (isWorst) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? "bg-red-200 border border-red-300 text-red-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(false); @@ -97,7 +102,7 @@ export function getMetricBadgeWithRelative( // For non-best values, show how much worse they are if (!isBest && max > 0) { - if (metricType === "cost" || metricType === "duration") { + if (metricType === "cost" || metricType === "duration" || metricType === "reasoning") { relativeText = `${(max / value).toFixed(1)}x ${getComparisonText(false)}`; } else { relativeText = `${(max / value).toFixed(1)}x`; @@ -109,14 +114,14 @@ export function getMetricBadgeWithRelative( if (isBest) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? "bg-green-200 border border-green-400 text-green-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(true); relativeText = comparisonText ? `${(max / min).toFixed(1)}x ${comparisonText}` : `${(max / min).toFixed(1)}x`; } else if (isWorst) { color = - metricType === "cost" || metricType === "duration" + metricType === "cost" || metricType === "duration" || metricType === "reasoning" ? "bg-red-200 border border-red-300 text-red-900" : "bg-transparent border border-gray-200 text-gray-700"; const comparisonText = getComparisonText(false); @@ -127,7 +132,7 @@ export function getMetricBadgeWithRelative( // For non-best values, show how much worse they are if (!isBest && min > 0) { - if (metricType === "cost" || metricType === "duration") { + if (metricType === "cost" || metricType === "duration" || metricType === "reasoning") { relativeText = `${(value / min).toFixed(1)}x ${getComparisonText(false)}`; } else { relativeText = `${(value / min).toFixed(1)}x`; @@ -256,6 +261,16 @@ export function shouldIncludeDurationMetric( ); } +export function shouldIncludeReasoningMetric( + completion: ExperimentCompletion | undefined +): completion is ExperimentCompletion { + return ( + completion != null && + getReasoningTokenCount(completion) !== undefined && + !completion.output?.error + ); +} + export function getValidCosts(completions: (ExperimentCompletion | undefined)[]): number[] { return completions .filter((completion): completion is ExperimentCompletion => shouldIncludeCostMetric(completion)) @@ -268,26 +283,39 @@ export function getValidDurations(completions: (ExperimentCompletion | undefined .map((completion) => completion.duration_seconds); } +export function getValidReasoningTokens(completions: (ExperimentCompletion | undefined)[]): number[] { + return completions + .filter((completion): completion is ExperimentCompletion => shouldIncludeReasoningMetric(completion)) + .map((completion) => getReasoningTokenCount(completion)) + .filter((tokens): tokens is number => tokens !== undefined); +} + export function calculateAverageMetrics(completions: ExperimentCompletion[]): { avgCost: number | undefined; avgDuration: number | undefined; + avgReasoningTokens: number | undefined; costs: number[]; durations: number[]; + reasoningTokens: number[]; } { - if (completions.length === 0) return { avgCost: undefined, avgDuration: undefined, costs: [], durations: [] }; + if (completions.length === 0) return { avgCost: undefined, avgDuration: undefined, avgReasoningTokens: undefined, costs: [], durations: [], reasoningTokens: [] }; // Use centralized filtering logic const costs = getValidCosts(completions); const durations = getValidDurations(completions); + const reasoningTokens = getValidReasoningTokens(completions); const totalCost = costs.reduce((sum, cost) => sum + cost, 0); const totalDuration = durations.reduce((sum, duration) => sum + duration, 0); + const totalReasoningTokens = reasoningTokens.reduce((sum, tokens) => sum + tokens, 0); return { avgCost: costs.length > 0 ? totalCost / costs.length : undefined, avgDuration: durations.length > 0 ? totalDuration / durations.length : undefined, + avgReasoningTokens: reasoningTokens.length > 0 ? totalReasoningTokens / reasoningTokens.length : undefined, costs, durations, + reasoningTokens, }; } @@ -325,7 +353,14 @@ export function getPriceAndLatencyPerVersion( }> ): Array<{ versionId: string; - metrics: { avgCost: number | undefined; avgDuration: number | undefined; costs: number[]; durations: number[] }; + metrics: { + avgCost: number | undefined; + avgDuration: number | undefined; + avgReasoningTokens: number | undefined; + costs: number[]; + durations: number[]; + reasoningTokens: number[]; + }; }> { return completionsPerVersion.map(({ versionId, completions }) => ({ versionId, @@ -1134,3 +1169,131 @@ export function stripMarkdown(markdown: string): string { .replace(/\n+/g, " ") // Replace newlines with spaces .trim(); } + +/** + * Extracts the reasoning token count from a completion's traces. + * Looks through all LLM traces and returns the total reasoning tokens used. + * + * @param completion - The completion object containing traces + * @returns The total number of reasoning tokens used, or undefined if reasoning tokens are not present in the trace structure + */ +export function getReasoningTokenCount(completion: Completion | ExperimentCompletion): number | undefined { + // For ExperimentCompletion, use the direct reasoning_token_count field + if ('reasoning_token_count' in completion) { + return completion.reasoning_token_count; + } + + // For regular Completion, fall back to parsing traces + const traces = 'traces' in completion ? completion.traces : undefined; + + if (!traces || !Array.isArray(traces)) { + return undefined; + } + + let totalReasoningTokens = 0; + let hasReasoningField = false; + + for (const trace of traces) { + // Only check LLM traces + if (trace.kind !== "llm") continue; + + const llmTrace = trace as Extract; + + // Check if trace has usage data + if (!llmTrace.usage) continue; + + // Handle both new detailed usage structure and old simple structure + if ("completion" in llmTrace.usage && llmTrace.usage.completion) { + const completionUsage = llmTrace.usage.completion; + + // Check if reasoning_token_count field exists (even if it's 0) + if ('reasoning_token_count' in completionUsage && completionUsage.reasoning_token_count !== undefined) { + hasReasoningField = true; + totalReasoningTokens += completionUsage.reasoning_token_count || 0; + } + } + } + + // Return undefined if no traces had reasoning_token_count field + // Return the total (including 0) if the field was present + return hasReasoningField ? totalReasoningTokens : undefined; +} + +/** + * Checks if a completion used reasoning (has reasoning tokens > 0) + * + * @param completion - The completion object to check + * @returns True if the completion used reasoning, false if no reasoning or reasoning tokens not present + */ +export function hasReasoningTokens(completion: Completion | ExperimentCompletion): boolean { + const reasoningTokens = getReasoningTokenCount(completion); + return reasoningTokens !== undefined && reasoningTokens > 0; +} + +/** + * Gets a summary of token usage from completion traces including reasoning tokens + * + * @param completion - The completion object containing traces + * @returns Object with token usage breakdown, reasoningTokens is undefined if not present in trace + */ +export function getTokenUsageSummary(completion: Completion | ExperimentCompletion): { + promptTokens: number; + completionTokens: number; + reasoningTokens: number | undefined; + cachedTokens: number; + totalTokens: number; +} { + const traces = 'traces' in completion ? completion.traces : undefined; + + let promptTokens = 0; + let completionTokens = 0; + let reasoningTokens: number | undefined = undefined; + let cachedTokens = 0; + let hasReasoningField = false; + + if (traces && Array.isArray(traces)) { + for (const trace of traces) { + if (trace.kind !== "llm") continue; + + const llmTrace = trace as Extract; + + if (!llmTrace.usage) continue; + + // Handle detailed usage structure + if ("prompt" in llmTrace.usage && "completion" in llmTrace.usage) { + const usage = llmTrace.usage as { + prompt: { text_token_count?: number }; + completion: { text_token_count?: number; reasoning_token_count?: number; cached_token_count?: number }; + }; + + if (usage.prompt.text_token_count) { + promptTokens += usage.prompt.text_token_count; + } + + if (usage.completion.text_token_count) { + completionTokens += usage.completion.text_token_count; + } + + if ('reasoning_token_count' in usage.completion && usage.completion.reasoning_token_count !== undefined) { + if (!hasReasoningField) { + hasReasoningField = true; + reasoningTokens = 0; // Initialize when we first find the field + } + reasoningTokens = (reasoningTokens || 0) + (usage.completion.reasoning_token_count || 0); + } + + if (usage.completion.cached_token_count) { + cachedTokens += usage.completion.cached_token_count; + } + } + } + } + + return { + promptTokens, + completionTokens, + reasoningTokens, + cachedTokens, + totalTokens: promptTokens + completionTokens + (reasoningTokens || 0), + }; +} diff --git a/web/src/types/models.ts b/web/src/types/models.ts index cbd6698c..c1bfcdb8 100644 --- a/web/src/types/models.ts +++ b/web/src/types/models.ts @@ -145,10 +145,23 @@ export interface Output { error?: Error; } +export interface TokenUsage { + text_token_count?: number; + audio_token_count?: number; + audio_count?: number; + image_token_count?: number; + image_count?: number; + cost_usd: number; +} + +export interface CompletionUsage extends TokenUsage { + cached_token_count?: number; + reasoning_token_count?: number; +} + export interface InferenceUsage { - input_tokens: number; - output_tokens: number; - total_tokens: number; + prompt: TokenUsage; + completion: CompletionUsage; } export interface LLMTrace { @@ -199,6 +212,7 @@ export interface ExperimentCompletion { output: Output; cost_usd: number; duration_seconds: number; + reasoning_token_count?: number; } export interface Completion { From 8876ec2c2b32a5bcb9c9b95cb956c9c2d01bf075 Mon Sep 17 00:00:00 2001 From: Jacek Zimonski <39839016+jacekzimonski@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:11:00 +0200 Subject: [PATCH 2/4] Fixes --- web/eslint.config.mjs | 10 +++++ .../app/home/components/bundled-content.ts | 9 ++--- .../components/utils/__tests__/utils.test.ts | 37 +++++++++++-------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/web/eslint.config.mjs b/web/eslint.config.mjs index 22331647..051521d9 100644 --- a/web/eslint.config.mjs +++ b/web/eslint.config.mjs @@ -12,6 +12,16 @@ const compat = new FlatCompat({ const eslintConfig = [ ...compat.extends("next/core-web-vitals", "next/typescript"), { + ignores: [ + ".next/**/*", + "out/**/*", + "build/**/*", + "dist/**/*", + "node_modules/**/*", + ".next/types/**/*", + ".next/static/**/*", + ".next/server/**/*", + ], rules: { "no-restricted-imports": [ "error", diff --git a/web/src/app/home/components/bundled-content.ts b/web/src/app/home/components/bundled-content.ts index d7763d8d..a91c6d8f 100644 --- a/web/src/app/home/components/bundled-content.ts +++ b/web/src/app/home/components/bundled-content.ts @@ -8,9 +8,8 @@ export interface BundledPreviewData { export const bundledPreviewContent: BundledPreviewData = { frontmatter: { - title: "AnotherAI: a MCP server designed for AI engineering", - description: "Public preview", - }, - content: - '\nToday we\'re introducing a public preview of **AnotherAI**, a MCP server designed for AI engineering tasks that includes a set of tools that enables your AI assistant (such as Claude Code, Cursor, etc.) to:\n\n- run experiments to compare any models, and analyze the results (quality, speed, cost). [[docs](https://docs.anotherai.dev/use-cases/fundamentals/experiments)]\n- access production LLM completions to debug and improve agents based on real data. [[docs](https://docs.anotherai.dev/use-cases/fundamentals/debugging)]\n- collect and analyze users\' feedback to improve an agent. [[docs](https://docs.anotherai.dev/use-cases/user-feedback)]\n- answer any questions about metrics (usage, performance, etc.) [[docs](https://docs.anotherai.dev/use-cases/fundamentals/metrics)]\n- deploy a new prompt or model without any code change [[docs](https://docs.anotherai.dev/use-cases/fundamentals/deployments)]\n\nOur work is available at:\n\n- https://anotherai.dev as a managed service, billed at the same cost as the underlying models (no markup).\n- https://github.com/anotherai-dev/anotherai under the Apache 2.0 license.\n\n## AI that can compare models\' performance, price, and latency.\n\nAnotherAI\'s MCP server exposes tools that let your AI assistant access over 100 models, and compare their performance, price, and latency. In our own tests, we\'ve found that models like Opus 4 are very good at reviewing work from other models, and the latest improvements in longer context windows (Sonnet and Gemini support up to 1M tokens) make it possible to compare more parameters (models and prompts) and agents with longer inputs.\n\n[video]\n\nSome prompt examples:\n\n```\n> can you compare Gemini 2.5 Flash, GPT-4o mini and Mistral Small for this agent ""?\n> can you find a model that is faster but keeps the same quality and does not cost more?\n> can you test how GPT-5 performs on this agent ""?\n> can you adjust the prompt for the agent "" to include few shot examples? validate that the outputs are improved.\n> ...\n```\n\nBecause your AI assistant can\'t always be trusted without a human in the loop, we\'ve also implemented a web UI to review the experiments made by your AI assistant.\n\n[screenshot]\n\n## AI learns from production data.\n\nLearning from production usage is a key step to improving any AI agent. To learn from production usage, we have implemented an OpenAI compatible API that logs all the completions data coming through, and then our MCP server exposes these logs to your AI assistant.\n\nSome prompt examples:\n\n```\n> can you look at the last 20 completions for the agent "" and report back the ones that are not good?\n> can you understand why the customer "" had a bad experience with agent ""?\n> ...\n```\n\nLearn more about how to use the MCP server to learn from production data [here](https://docs.anotherai.dev/use-cases/fundamentals/debugging).\n\nSome people might not like the idea of adding a proxy as a new single point of failure in their LLM architecture, so we are also exploring exposing an API endpoint to import completions after they have been generated (like traditional observability tools). If you\'re interested in this feature, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\n## AI learns from users\' feedback.\n\nOn top of the completions logs, collecting users\' feedback is another key step to improving any AI agent. To create a fluid feedback loop, we are exposing an Annotations API to let your end-users leave feedback on completions. Then our MCP server exposes these annotations to your AI assistant.\n\nWe believe that AI assistants are so good now that they are able to read users\' feedback, identify issues, propose improvements, and run experiments to test changes using production data.\n\nSome prompt examples:\n\n```\n> can you look at the users\' feedback for the agent "" in the last week, and write a report with the most common issues?\n> based on the users\' feedback, think about some improvements we can make to the agent "" and run an experiment to test them using the latest production data.\n> ...\n```\n\nLearn more about how to use the MCP server to learn from users\' feedback [here](https://docs.anotherai.dev/use-cases/user-feedback).\n\n## Deploy a new prompt or model without any code change.\n\nOne very popular feature of our previous product ([WorkflowAI](https://workflowai.com)) was the ability to update an agent\'s prompt or model without any code change. This feature enables faster iteration cycles, and fixing a prompt can be done without a PR and deployment. We\'ve implemented the same feature in AnotherAI\'s MCP server, with a human confirmation step to prevent your AI assistant from making changes that are not intended.\n\nSome prompt examples:\n\n```\n> can you update the deployment "" to use the model ""?\n> update the prompt from "" to use the prompt from this version ""?\n> ...\n```\n\nLearn more about how to use the MCP server to deploy a new prompt or model without any code change [here](https://docs.anotherai.dev/use-cases/fundamentals/deployments).\n\n## AI deep dives into metrics.\n\nBecause our LLM gateway logs all the completions data, we wanted to give you and your AI assistant the best way to leverage this data. So we\'ve designed two complementary components:\n\n- an MCP tool `query_completions(sql_query)` that allows your AI assistant to query the completions data using SQL queries. We\'ve been really impressed by how good AI assistants are at transforming a natural language question into a complex SQL query. Using SQL instead of a predefined API allows the AI assistant to query the data in very powerful ways.\n- a web UI to view graphs and metrics about your agents. Your AI assistant can use the tool `create_or_update_view(view)` to create a view that will be saved and can be accessed in the web UI.\n\nSome prompt examples:\n\n```\n> what is our most expensive agent? can we run an experiment to find a cheaper model that keeps the same quality?\n> what is the p90, and p99 latency for the agent ""?\n> can you create a graph that shows the cost by agent in the last month?\n```\n\n[video]\n\nWe\'ve also published a note about how we have secured the `query_completions` tool [here](https://docs.anotherai.dev/security#sql-query-tool-security) from malicious use. We welcome more feedback on our approach via our [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA)\n\n## Some (current) limitations.\n\nWe\'ve focused this initial preview on simple AI agent architectures, not complex agentic systems. Agents that have multiple back-and-forth interactions or custom tools are harder to reproduce with other prompts and models because you need to be able to simulate one end of the conversation, and for custom tools you need to run the code somehow. If you\'re building a complex agentic system, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\nFor very low latency agents, using AnotherAI\'s LLM gateway might not be the best option due to the added latency of the gateway, which we estimate at ~100ms. It\'s also possible to use AnotherAI\'s MCP server independently from the AI gateway to run experiments between models and prompts.\n\n## Try it\n\nThe first step is to install the MCP server, you can find the instructions [here](https://docs.anotherai.dev/getting-started). Once the MCP server is installed, find your first use-case by looking at our use-cases in the [docs](https://docs.anotherai.dev/). New accounts will get $1 of free credits to try it out.\n\nWe are really excited to hear from you, please join our [Slack channel](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) to share what you\'re building with AnotherAI, meet our team, ask any questions, or just say hi.\n\nAnotherAI\'s team.\n[Pierre](https://x.com/pierrevalade), Anya, Guillaume, Jacek.\n\n## FAQ\n\n\n \n AnotherAI MCP gets access to the completions data via the AnotherAI LLM gateway. The LLM gateway logs all the\n completions data and makes it available to the MCP server.\n \n\n', + "title": "AnotherAI: a MCP server designed for AI engineering", + "description": "Public preview" +}, + content: "\nToday we're introducing a public preview of **AnotherAI**, a MCP server designed for AI engineering tasks that includes a set of tools that enables your AI assistant (such as Claude Code, Cursor, etc.) to:\n\n- run experiments to compare any models, and analyze the results (quality, speed, cost). [[docs](https://docs.anotherai.dev/use-cases/fundamentals/experiments)]\n- access production LLM completions to debug and improve agents based on real data. [[docs](https://docs.anotherai.dev/use-cases/fundamentals/debugging)]\n- collect and analyze users' feedback to improve an agent. [[docs](https://docs.anotherai.dev/use-cases/user-feedback)]\n- answer any questions about metrics (usage, performance, etc.) [[docs](https://docs.anotherai.dev/use-cases/fundamentals/metrics)]\n- deploy a new prompt or model without any code change [[docs](https://docs.anotherai.dev/use-cases/fundamentals/deployments)]\n\nOur work is available at:\n\n- https://anotherai.dev as a managed service, billed at the same cost as the underlying models (no markup).\n- https://github.com/anotherai-dev/anotherai under the Apache 2.0 license.\n\n## AI that can compare models' performance, price, and latency.\n\nAnotherAI's MCP server exposes tools that let your AI assistant access over 100 models, and compare their performance, price, and latency. In our own tests, we've found that models like Opus 4 are very good at reviewing work from other models, and the latest improvements in longer context windows (Sonnet and Gemini support up to 1M tokens) make it possible to compare more parameters (models and prompts) and agents with longer inputs.\n\n[video]\n\nSome prompt examples:\n\n```\n> can you compare Gemini 2.5 Flash, GPT-4o mini and Mistral Small for this agent \"\"?\n> can you find a model that is faster but keeps the same quality and does not cost more?\n> can you test how GPT-5 performs on this agent \"\"?\n> can you adjust the prompt for the agent \"\" to include few shot examples? validate that the outputs are improved.\n> ...\n```\n\nBecause your AI assistant can't always be trusted without a human in the loop, we've also implemented a web UI to review the experiments made by your AI assistant.\n\n[screenshot]\n\n## AI learns from production data.\n\nLearning from production usage is a key step to improving any AI agent. To learn from production usage, we have implemented an OpenAI compatible API that logs all the completions data coming through, and then our MCP server exposes these logs to your AI assistant.\n\nSome prompt examples:\n\n```\n> can you look at the last 20 completions for the agent \"\" and report back the ones that are not good?\n> can you understand why the customer \"\" had a bad experience with agent \"\"?\n> ...\n```\n\nLearn more about how to use the MCP server to learn from production data [here](https://docs.anotherai.dev/use-cases/fundamentals/debugging).\n\nSome people might not like the idea of adding a proxy as a new single point of failure in their LLM architecture, so we are also exploring exposing an API endpoint to import completions after they have been generated (like traditional observability tools). If you're interested in this feature, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\n## AI learns from users' feedback.\n\nOn top of the completions logs, collecting users' feedback is another key step to improving any AI agent. To create a fluid feedback loop, we are exposing an Annotations API to let your end-users leave feedback on completions. Then our MCP server exposes these annotations to your AI assistant.\n\nWe believe that AI assistants are so good now that they are able to read users' feedback, identify issues, propose improvements, and run experiments to test changes using production data.\n\nSome prompt examples:\n\n```\n> can you look at the users' feedback for the agent \"\" in the last week, and write a report with the most common issues?\n> based on the users' feedback, think about some improvements we can make to the agent \"\" and run an experiment to test them using the latest production data.\n> ...\n```\n\nLearn more about how to use the MCP server to learn from users' feedback [here](https://docs.anotherai.dev/use-cases/user-feedback).\n\n## Deploy a new prompt or model without any code change.\n\nOne very popular feature of our previous product ([WorkflowAI](https://workflowai.com)) was the ability to update an agent's prompt or model without any code change. This feature enables faster iteration cycles, and fixing a prompt can be done without a PR and deployment. We've implemented the same feature in AnotherAI's MCP server, with a human confirmation step to prevent your AI assistant from making changes that are not intended.\n\nSome prompt examples:\n\n```\n> can you update the deployment \"\" to use the model \"\"?\n> update the prompt from \"\" to use the prompt from this version \"\"?\n> ...\n```\n\nLearn more about how to use the MCP server to deploy a new prompt or model without any code change [here](https://docs.anotherai.dev/use-cases/fundamentals/deployments).\n\n## AI deep dives into metrics.\n\nBecause our LLM gateway logs all the completions data, we wanted to give you and your AI assistant the best way to leverage this data. So we've designed two complementary components:\n\n- an MCP tool `query_completions(sql_query)` that allows your AI assistant to query the completions data using SQL queries. We've been really impressed by how good AI assistants are at transforming a natural language question into a complex SQL query. Using SQL instead of a predefined API allows the AI assistant to query the data in very powerful ways.\n- a web UI to view graphs and metrics about your agents. Your AI assistant can use the tool `create_or_update_view(view)` to create a view that will be saved and can be accessed in the web UI.\n\nSome prompt examples:\n\n```\n> what is our most expensive agent? can we run an experiment to find a cheaper model that keeps the same quality?\n> what is the p90, and p99 latency for the agent \"\"?\n> can you create a graph that shows the cost by agent in the last month?\n```\n\n[video]\n\nWe've also published a note about how we have secured the `query_completions` tool [here](https://docs.anotherai.dev/security#sql-query-tool-security) from malicious use. We welcome more feedback on our approach via our [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA)\n\n## Some (current) limitations.\n\nWe've focused this initial preview on simple AI agent architectures, not complex agentic systems. Agents that have multiple back-and-forth interactions or custom tools are harder to reproduce with other prompts and models because you need to be able to simulate one end of the conversation, and for custom tools you need to run the code somehow. If you're building a complex agentic system, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\nFor very low latency agents, using AnotherAI's LLM gateway might not be the best option due to the added latency of the gateway, which we estimate at ~100ms. It's also possible to use AnotherAI's MCP server independently from the AI gateway to run experiments between models and prompts.\n\n## Try it\n\nThe first step is to install the MCP server, you can find the instructions [here](https://docs.anotherai.dev/getting-started). Once the MCP server is installed, find your first use-case by looking at our use-cases in the [docs](https://docs.anotherai.dev/). New accounts will get $1 of free credits to try it out.\n\nWe are really excited to hear from you, please join our [Slack channel](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) to share what you're building with AnotherAI, meet our team, ask any questions, or just say hi.\n\nAnotherAI's team.\n[Pierre](https://x.com/pierrevalade), Anya, Guillaume, Jacek.\n\n## FAQ\n\n\n \n AnotherAI MCP gets access to the completions data via the AnotherAI LLM gateway. The LLM gateway logs all the\n completions data and makes it available to the MCP server.\n \n\n", }; diff --git a/web/src/components/utils/__tests__/utils.test.ts b/web/src/components/utils/__tests__/utils.test.ts index 70e5d291..aca1cd13 100644 --- a/web/src/components/utils/__tests__/utils.test.ts +++ b/web/src/components/utils/__tests__/utils.test.ts @@ -1,4 +1,4 @@ -import { Version } from "@/types/models"; +import { Version, Completion, ExperimentCompletion } from "@/types/models"; import { calculateAverageMetrics, filterAnnotations, @@ -360,46 +360,51 @@ describe("Calculation Functions", () => { describe("getReasoningTokenCount", () => { it("returns reasoning tokens from ExperimentCompletion field", () => { - const completion = { + const completion: Partial = { reasoning_token_count: 150 }; - expect(getReasoningTokenCount(completion as any)).toBe(150); + expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(150); }); it("returns 0 for explicitly 0 reasoning tokens", () => { - const completion = { + const completion: Partial = { reasoning_token_count: 0 }; - expect(getReasoningTokenCount(completion as any)).toBe(0); + expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(0); }); it("returns undefined when reasoning tokens field not present", () => { - const completion = {}; - expect(getReasoningTokenCount(completion as any)).toBeUndefined(); + const completion: Partial = {}; + expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBeUndefined(); }); it("falls back to traces for regular Completion type", () => { - const completion = { + const completion: Partial = { traces: [ { kind: "llm", + duration_seconds: 1, + cost_usd: 0, + model: "test-model", + provider: "test-provider", usage: { - completion: { reasoning_token_count: 100 }, + prompt: { text_token_count: 50, cost_usd: 0 }, + completion: { text_token_count: 100, reasoning_token_count: 100, cost_usd: 0 }, }, }, ], }; - expect(getReasoningTokenCount(completion as any)).toBe(100); + expect(getReasoningTokenCount(completion as Completion)).toBe(100); }); }); describe("shouldIncludeReasoningMetric", () => { it("returns true for valid completion with reasoning tokens", () => { - const completion = { + const completion: Partial = { reasoning_token_count: 150, output: { messages: [] }, }; - expect(shouldIncludeReasoningMetric(completion as any)).toBe(true); + expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(true); }); it("returns false for undefined completion", () => { @@ -407,20 +412,20 @@ describe("Calculation Functions", () => { }); it("returns false when output has error", () => { - const completion = { + const completion: Partial = { reasoning_token_count: 150, output: { error: { error: "Something went wrong" }, }, }; - expect(shouldIncludeReasoningMetric(completion as any)).toBe(false); + expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(false); }); it("returns false when reasoning tokens are undefined", () => { - const completion = { + const completion: Partial = { output: { messages: [] }, }; - expect(shouldIncludeReasoningMetric(completion as any)).toBe(false); + expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(false); }); }); }); From c02bbdf796474294ba1b5a998bf8a672f18bd7f8 Mon Sep 17 00:00:00 2001 From: Jacek Zimonski <39839016+jacekzimonski@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:17:42 +0200 Subject: [PATCH 3/4] Fix --- .../api/_services/experiment_service.py | 6 +-- .../Results/completion/CompletionCell.tsx | 7 ++- .../app/home/components/bundled-content.ts | 9 ++-- web/src/components/MetricItem.tsx | 14 ++--- .../completion-modal/TracesView.tsx | 16 +++--- .../components/utils/__tests__/utils.test.ts | 12 ++--- web/src/components/utils/utils.ts | 52 ++++++++++--------- 7 files changed, 63 insertions(+), 53 deletions(-) diff --git a/backend/protocol/api/_services/experiment_service.py b/backend/protocol/api/_services/experiment_service.py index b38a1bd8..2338af4a 100644 --- a/backend/protocol/api/_services/experiment_service.py +++ b/backend/protocol/api/_services/experiment_service.py @@ -87,9 +87,9 @@ async def get_experiment( # Fetch reasoning tokens for outputs if they exist if exp.outputs: completion_ids = [output.completion_id for output in exp.outputs] - # Get completions with traces from ClickHouse + # Get completions with traces from ClickHouse (only exclude agent_id to keep traces) completions_with_traces = await self.completion_storage.completions_by_ids( - completion_ids, exclude={"input_variables", "input_messages", "output_messages", "messages"}, + completion_ids, exclude={"agent_id"}, ) # Create a map of completion_id -> reasoning_token_count reasoning_tokens_map = {} @@ -116,7 +116,7 @@ async def get_experiment( # getting annotations as needed return experiment_from_domain(exp, annotations) - def _calculate_reasoning_tokens_from_traces(self, traces: list | None) -> float | None: + def _calculate_reasoning_tokens_from_traces(self, traces: list[Any] | None) -> float | None: """Calculate total reasoning tokens from traces. Args: diff --git a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx index 5336b480..a9e07456 100644 --- a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx +++ b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx @@ -7,7 +7,12 @@ import { PageError } from "@/components/PageError"; import { useToast } from "@/components/ToastProvider"; import { AnnotationsView } from "@/components/annotations/AnnotationsView"; import { MessagesViewer } from "@/components/messages/MessagesViewer"; -import { shouldIncludeCostMetric, shouldIncludeDurationMetric, shouldIncludeReasoningMetric, getReasoningTokenCount } from "@/components/utils/utils"; +import { + getReasoningTokenCount, + shouldIncludeCostMetric, + shouldIncludeDurationMetric, + shouldIncludeReasoningMetric, +} from "@/components/utils/utils"; import { Annotation, ExperimentCompletion } from "@/types/models"; import { getMetricsForCompletion } from "../../../utils"; diff --git a/web/src/app/home/components/bundled-content.ts b/web/src/app/home/components/bundled-content.ts index a91c6d8f..d7763d8d 100644 --- a/web/src/app/home/components/bundled-content.ts +++ b/web/src/app/home/components/bundled-content.ts @@ -8,8 +8,9 @@ export interface BundledPreviewData { export const bundledPreviewContent: BundledPreviewData = { frontmatter: { - "title": "AnotherAI: a MCP server designed for AI engineering", - "description": "Public preview" -}, - content: "\nToday we're introducing a public preview of **AnotherAI**, a MCP server designed for AI engineering tasks that includes a set of tools that enables your AI assistant (such as Claude Code, Cursor, etc.) to:\n\n- run experiments to compare any models, and analyze the results (quality, speed, cost). [[docs](https://docs.anotherai.dev/use-cases/fundamentals/experiments)]\n- access production LLM completions to debug and improve agents based on real data. [[docs](https://docs.anotherai.dev/use-cases/fundamentals/debugging)]\n- collect and analyze users' feedback to improve an agent. [[docs](https://docs.anotherai.dev/use-cases/user-feedback)]\n- answer any questions about metrics (usage, performance, etc.) [[docs](https://docs.anotherai.dev/use-cases/fundamentals/metrics)]\n- deploy a new prompt or model without any code change [[docs](https://docs.anotherai.dev/use-cases/fundamentals/deployments)]\n\nOur work is available at:\n\n- https://anotherai.dev as a managed service, billed at the same cost as the underlying models (no markup).\n- https://github.com/anotherai-dev/anotherai under the Apache 2.0 license.\n\n## AI that can compare models' performance, price, and latency.\n\nAnotherAI's MCP server exposes tools that let your AI assistant access over 100 models, and compare their performance, price, and latency. In our own tests, we've found that models like Opus 4 are very good at reviewing work from other models, and the latest improvements in longer context windows (Sonnet and Gemini support up to 1M tokens) make it possible to compare more parameters (models and prompts) and agents with longer inputs.\n\n[video]\n\nSome prompt examples:\n\n```\n> can you compare Gemini 2.5 Flash, GPT-4o mini and Mistral Small for this agent \"\"?\n> can you find a model that is faster but keeps the same quality and does not cost more?\n> can you test how GPT-5 performs on this agent \"\"?\n> can you adjust the prompt for the agent \"\" to include few shot examples? validate that the outputs are improved.\n> ...\n```\n\nBecause your AI assistant can't always be trusted without a human in the loop, we've also implemented a web UI to review the experiments made by your AI assistant.\n\n[screenshot]\n\n## AI learns from production data.\n\nLearning from production usage is a key step to improving any AI agent. To learn from production usage, we have implemented an OpenAI compatible API that logs all the completions data coming through, and then our MCP server exposes these logs to your AI assistant.\n\nSome prompt examples:\n\n```\n> can you look at the last 20 completions for the agent \"\" and report back the ones that are not good?\n> can you understand why the customer \"\" had a bad experience with agent \"\"?\n> ...\n```\n\nLearn more about how to use the MCP server to learn from production data [here](https://docs.anotherai.dev/use-cases/fundamentals/debugging).\n\nSome people might not like the idea of adding a proxy as a new single point of failure in their LLM architecture, so we are also exploring exposing an API endpoint to import completions after they have been generated (like traditional observability tools). If you're interested in this feature, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\n## AI learns from users' feedback.\n\nOn top of the completions logs, collecting users' feedback is another key step to improving any AI agent. To create a fluid feedback loop, we are exposing an Annotations API to let your end-users leave feedback on completions. Then our MCP server exposes these annotations to your AI assistant.\n\nWe believe that AI assistants are so good now that they are able to read users' feedback, identify issues, propose improvements, and run experiments to test changes using production data.\n\nSome prompt examples:\n\n```\n> can you look at the users' feedback for the agent \"\" in the last week, and write a report with the most common issues?\n> based on the users' feedback, think about some improvements we can make to the agent \"\" and run an experiment to test them using the latest production data.\n> ...\n```\n\nLearn more about how to use the MCP server to learn from users' feedback [here](https://docs.anotherai.dev/use-cases/user-feedback).\n\n## Deploy a new prompt or model without any code change.\n\nOne very popular feature of our previous product ([WorkflowAI](https://workflowai.com)) was the ability to update an agent's prompt or model without any code change. This feature enables faster iteration cycles, and fixing a prompt can be done without a PR and deployment. We've implemented the same feature in AnotherAI's MCP server, with a human confirmation step to prevent your AI assistant from making changes that are not intended.\n\nSome prompt examples:\n\n```\n> can you update the deployment \"\" to use the model \"\"?\n> update the prompt from \"\" to use the prompt from this version \"\"?\n> ...\n```\n\nLearn more about how to use the MCP server to deploy a new prompt or model without any code change [here](https://docs.anotherai.dev/use-cases/fundamentals/deployments).\n\n## AI deep dives into metrics.\n\nBecause our LLM gateway logs all the completions data, we wanted to give you and your AI assistant the best way to leverage this data. So we've designed two complementary components:\n\n- an MCP tool `query_completions(sql_query)` that allows your AI assistant to query the completions data using SQL queries. We've been really impressed by how good AI assistants are at transforming a natural language question into a complex SQL query. Using SQL instead of a predefined API allows the AI assistant to query the data in very powerful ways.\n- a web UI to view graphs and metrics about your agents. Your AI assistant can use the tool `create_or_update_view(view)` to create a view that will be saved and can be accessed in the web UI.\n\nSome prompt examples:\n\n```\n> what is our most expensive agent? can we run an experiment to find a cheaper model that keeps the same quality?\n> what is the p90, and p99 latency for the agent \"\"?\n> can you create a graph that shows the cost by agent in the last month?\n```\n\n[video]\n\nWe've also published a note about how we have secured the `query_completions` tool [here](https://docs.anotherai.dev/security#sql-query-tool-security) from malicious use. We welcome more feedback on our approach via our [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA)\n\n## Some (current) limitations.\n\nWe've focused this initial preview on simple AI agent architectures, not complex agentic systems. Agents that have multiple back-and-forth interactions or custom tools are harder to reproduce with other prompts and models because you need to be able to simulate one end of the conversation, and for custom tools you need to run the code somehow. If you're building a complex agentic system, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\nFor very low latency agents, using AnotherAI's LLM gateway might not be the best option due to the added latency of the gateway, which we estimate at ~100ms. It's also possible to use AnotherAI's MCP server independently from the AI gateway to run experiments between models and prompts.\n\n## Try it\n\nThe first step is to install the MCP server, you can find the instructions [here](https://docs.anotherai.dev/getting-started). Once the MCP server is installed, find your first use-case by looking at our use-cases in the [docs](https://docs.anotherai.dev/). New accounts will get $1 of free credits to try it out.\n\nWe are really excited to hear from you, please join our [Slack channel](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) to share what you're building with AnotherAI, meet our team, ask any questions, or just say hi.\n\nAnotherAI's team.\n[Pierre](https://x.com/pierrevalade), Anya, Guillaume, Jacek.\n\n## FAQ\n\n\n \n AnotherAI MCP gets access to the completions data via the AnotherAI LLM gateway. The LLM gateway logs all the\n completions data and makes it available to the MCP server.\n \n\n", + title: "AnotherAI: a MCP server designed for AI engineering", + description: "Public preview", + }, + content: + '\nToday we\'re introducing a public preview of **AnotherAI**, a MCP server designed for AI engineering tasks that includes a set of tools that enables your AI assistant (such as Claude Code, Cursor, etc.) to:\n\n- run experiments to compare any models, and analyze the results (quality, speed, cost). [[docs](https://docs.anotherai.dev/use-cases/fundamentals/experiments)]\n- access production LLM completions to debug and improve agents based on real data. [[docs](https://docs.anotherai.dev/use-cases/fundamentals/debugging)]\n- collect and analyze users\' feedback to improve an agent. [[docs](https://docs.anotherai.dev/use-cases/user-feedback)]\n- answer any questions about metrics (usage, performance, etc.) [[docs](https://docs.anotherai.dev/use-cases/fundamentals/metrics)]\n- deploy a new prompt or model without any code change [[docs](https://docs.anotherai.dev/use-cases/fundamentals/deployments)]\n\nOur work is available at:\n\n- https://anotherai.dev as a managed service, billed at the same cost as the underlying models (no markup).\n- https://github.com/anotherai-dev/anotherai under the Apache 2.0 license.\n\n## AI that can compare models\' performance, price, and latency.\n\nAnotherAI\'s MCP server exposes tools that let your AI assistant access over 100 models, and compare their performance, price, and latency. In our own tests, we\'ve found that models like Opus 4 are very good at reviewing work from other models, and the latest improvements in longer context windows (Sonnet and Gemini support up to 1M tokens) make it possible to compare more parameters (models and prompts) and agents with longer inputs.\n\n[video]\n\nSome prompt examples:\n\n```\n> can you compare Gemini 2.5 Flash, GPT-4o mini and Mistral Small for this agent ""?\n> can you find a model that is faster but keeps the same quality and does not cost more?\n> can you test how GPT-5 performs on this agent ""?\n> can you adjust the prompt for the agent "" to include few shot examples? validate that the outputs are improved.\n> ...\n```\n\nBecause your AI assistant can\'t always be trusted without a human in the loop, we\'ve also implemented a web UI to review the experiments made by your AI assistant.\n\n[screenshot]\n\n## AI learns from production data.\n\nLearning from production usage is a key step to improving any AI agent. To learn from production usage, we have implemented an OpenAI compatible API that logs all the completions data coming through, and then our MCP server exposes these logs to your AI assistant.\n\nSome prompt examples:\n\n```\n> can you look at the last 20 completions for the agent "" and report back the ones that are not good?\n> can you understand why the customer "" had a bad experience with agent ""?\n> ...\n```\n\nLearn more about how to use the MCP server to learn from production data [here](https://docs.anotherai.dev/use-cases/fundamentals/debugging).\n\nSome people might not like the idea of adding a proxy as a new single point of failure in their LLM architecture, so we are also exploring exposing an API endpoint to import completions after they have been generated (like traditional observability tools). If you\'re interested in this feature, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\n## AI learns from users\' feedback.\n\nOn top of the completions logs, collecting users\' feedback is another key step to improving any AI agent. To create a fluid feedback loop, we are exposing an Annotations API to let your end-users leave feedback on completions. Then our MCP server exposes these annotations to your AI assistant.\n\nWe believe that AI assistants are so good now that they are able to read users\' feedback, identify issues, propose improvements, and run experiments to test changes using production data.\n\nSome prompt examples:\n\n```\n> can you look at the users\' feedback for the agent "" in the last week, and write a report with the most common issues?\n> based on the users\' feedback, think about some improvements we can make to the agent "" and run an experiment to test them using the latest production data.\n> ...\n```\n\nLearn more about how to use the MCP server to learn from users\' feedback [here](https://docs.anotherai.dev/use-cases/user-feedback).\n\n## Deploy a new prompt or model without any code change.\n\nOne very popular feature of our previous product ([WorkflowAI](https://workflowai.com)) was the ability to update an agent\'s prompt or model without any code change. This feature enables faster iteration cycles, and fixing a prompt can be done without a PR and deployment. We\'ve implemented the same feature in AnotherAI\'s MCP server, with a human confirmation step to prevent your AI assistant from making changes that are not intended.\n\nSome prompt examples:\n\n```\n> can you update the deployment "" to use the model ""?\n> update the prompt from "" to use the prompt from this version ""?\n> ...\n```\n\nLearn more about how to use the MCP server to deploy a new prompt or model without any code change [here](https://docs.anotherai.dev/use-cases/fundamentals/deployments).\n\n## AI deep dives into metrics.\n\nBecause our LLM gateway logs all the completions data, we wanted to give you and your AI assistant the best way to leverage this data. So we\'ve designed two complementary components:\n\n- an MCP tool `query_completions(sql_query)` that allows your AI assistant to query the completions data using SQL queries. We\'ve been really impressed by how good AI assistants are at transforming a natural language question into a complex SQL query. Using SQL instead of a predefined API allows the AI assistant to query the data in very powerful ways.\n- a web UI to view graphs and metrics about your agents. Your AI assistant can use the tool `create_or_update_view(view)` to create a view that will be saved and can be accessed in the web UI.\n\nSome prompt examples:\n\n```\n> what is our most expensive agent? can we run an experiment to find a cheaper model that keeps the same quality?\n> what is the p90, and p99 latency for the agent ""?\n> can you create a graph that shows the cost by agent in the last month?\n```\n\n[video]\n\nWe\'ve also published a note about how we have secured the `query_completions` tool [here](https://docs.anotherai.dev/security#sql-query-tool-security) from malicious use. We welcome more feedback on our approach via our [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA)\n\n## Some (current) limitations.\n\nWe\'ve focused this initial preview on simple AI agent architectures, not complex agentic systems. Agents that have multiple back-and-forth interactions or custom tools are harder to reproduce with other prompts and models because you need to be able to simulate one end of the conversation, and for custom tools you need to run the code somehow. If you\'re building a complex agentic system, please get in touch on [Slack](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) so we can design something that works well together.\n\nFor very low latency agents, using AnotherAI\'s LLM gateway might not be the best option due to the added latency of the gateway, which we estimate at ~100ms. It\'s also possible to use AnotherAI\'s MCP server independently from the AI gateway to run experiments between models and prompts.\n\n## Try it\n\nThe first step is to install the MCP server, you can find the instructions [here](https://docs.anotherai.dev/getting-started). Once the MCP server is installed, find your first use-case by looking at our use-cases in the [docs](https://docs.anotherai.dev/). New accounts will get $1 of free credits to try it out.\n\nWe are really excited to hear from you, please join our [Slack channel](https://join.slack.com/t/anotherai-dev/shared_invite/zt-3av2prezr-Lz10~8o~rSRQE72m_PyIJA) to share what you\'re building with AnotherAI, meet our team, ask any questions, or just say hi.\n\nAnotherAI\'s team.\n[Pierre](https://x.com/pierrevalade), Anya, Guillaume, Jacek.\n\n## FAQ\n\n\n \n AnotherAI MCP gets access to the completions data via the AnotherAI LLM gateway. The LLM gateway logs all the\n completions data and makes it available to the MCP server.\n \n\n', }; diff --git a/web/src/components/MetricItem.tsx b/web/src/components/MetricItem.tsx index 61e785dc..b19a3f87 100644 --- a/web/src/components/MetricItem.tsx +++ b/web/src/components/MetricItem.tsx @@ -120,19 +120,21 @@ export function MetricItem({ const displayLabel = showAvgPrefix ? `Average ${ - metricKey === "cost" - ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") + metricKey === "cost" + ? usePer1kMultiplier + ? "cost (per 1K)" + : "cost" : metricKey === "reasoning" - ? "reasoning" - : metricKey.replace(/_/g, " ") + ? "reasoning" + : metricKey.replace(/_/g, " ") }` : metricKey === "cost" ? usePer1kMultiplier ? "cost (per 1K)" : "cost" : metricKey === "reasoning" - ? "reasoning" - : metricKey.replace(/_/g, " "); + ? "reasoning" + : metricKey.replace(/_/g, " "); if (percentiles && showAvgPrefix) { return ( diff --git a/web/src/components/completion-modal/TracesView.tsx b/web/src/components/completion-modal/TracesView.tsx index 2ba388f5..b74e1895 100644 --- a/web/src/components/completion-modal/TracesView.tsx +++ b/web/src/components/completion-modal/TracesView.tsx @@ -19,7 +19,12 @@ function UsageInfo({ trace, traceIndex }: UsageInfoProps) { if ("prompt" in trace.usage && "completion" in trace.usage) { const detailedUsage = trace.usage as { prompt: { text_token_count?: number; cost_usd: number }; - completion: { text_token_count?: number; reasoning_token_count?: number; cached_token_count?: number; cost_usd: number }; + completion: { + text_token_count?: number; + reasoning_token_count?: number; + cached_token_count?: number; + cost_usd: number; + }; }; const items = []; @@ -71,13 +76,7 @@ function UsageInfo({ trace, traceIndex }: UsageInfoProps) { // Total cost const totalCost = detailedUsage.prompt.cost_usd + detailedUsage.completion.cost_usd; if (totalCost > 0) { - items.push( - - ); + items.push(); } return <>{items}; @@ -142,7 +141,6 @@ export function TracesView({ traces }: Props) { return null; } - return (
diff --git a/web/src/components/utils/__tests__/utils.test.ts b/web/src/components/utils/__tests__/utils.test.ts index aca1cd13..ad6e8d71 100644 --- a/web/src/components/utils/__tests__/utils.test.ts +++ b/web/src/components/utils/__tests__/utils.test.ts @@ -1,4 +1,4 @@ -import { Version, Completion, ExperimentCompletion } from "@/types/models"; +import { Completion, ExperimentCompletion, Version } from "@/types/models"; import { calculateAverageMetrics, filterAnnotations, @@ -344,12 +344,12 @@ describe("Calculation Functions", () => { const completions = [ { ...mockExperimentCompletion(1, 2), - reasoning_token_count: 100 + reasoning_token_count: 100, }, { ...mockExperimentCompletion(2, 3), - reasoning_token_count: 200 - } + reasoning_token_count: 200, + }, ]; const result = calculateAverageMetrics(completions); @@ -361,14 +361,14 @@ describe("Calculation Functions", () => { describe("getReasoningTokenCount", () => { it("returns reasoning tokens from ExperimentCompletion field", () => { const completion: Partial = { - reasoning_token_count: 150 + reasoning_token_count: 150, }; expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(150); }); it("returns 0 for explicitly 0 reasoning tokens", () => { const completion: Partial = { - reasoning_token_count: 0 + reasoning_token_count: 0, }; expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(0); }); diff --git a/web/src/components/utils/utils.ts b/web/src/components/utils/utils.ts index 11ce4cd4..4bc00a40 100644 --- a/web/src/components/utils/utils.ts +++ b/web/src/components/utils/utils.ts @@ -264,11 +264,7 @@ export function shouldIncludeDurationMetric( export function shouldIncludeReasoningMetric( completion: ExperimentCompletion | undefined ): completion is ExperimentCompletion { - return ( - completion != null && - getReasoningTokenCount(completion) !== undefined && - !completion.output?.error - ); + return completion != null && getReasoningTokenCount(completion) !== undefined && !completion.output?.error; } export function getValidCosts(completions: (ExperimentCompletion | undefined)[]): number[] { @@ -298,7 +294,15 @@ export function calculateAverageMetrics(completions: ExperimentCompletion[]): { durations: number[]; reasoningTokens: number[]; } { - if (completions.length === 0) return { avgCost: undefined, avgDuration: undefined, avgReasoningTokens: undefined, costs: [], durations: [], reasoningTokens: [] }; + if (completions.length === 0) + return { + avgCost: undefined, + avgDuration: undefined, + avgReasoningTokens: undefined, + costs: [], + durations: [], + reasoningTokens: [], + }; // Use centralized filtering logic const costs = getValidCosts(completions); @@ -353,11 +357,11 @@ export function getPriceAndLatencyPerVersion( }> ): Array<{ versionId: string; - metrics: { - avgCost: number | undefined; - avgDuration: number | undefined; + metrics: { + avgCost: number | undefined; + avgDuration: number | undefined; avgReasoningTokens: number | undefined; - costs: number[]; + costs: number[]; durations: number[]; reasoningTokens: number[]; }; @@ -1173,19 +1177,19 @@ export function stripMarkdown(markdown: string): string { /** * Extracts the reasoning token count from a completion's traces. * Looks through all LLM traces and returns the total reasoning tokens used. - * + * * @param completion - The completion object containing traces * @returns The total number of reasoning tokens used, or undefined if reasoning tokens are not present in the trace structure */ export function getReasoningTokenCount(completion: Completion | ExperimentCompletion): number | undefined { // For ExperimentCompletion, use the direct reasoning_token_count field - if ('reasoning_token_count' in completion) { + if ("reasoning_token_count" in completion) { return completion.reasoning_token_count; } - + // For regular Completion, fall back to parsing traces - const traces = 'traces' in completion ? completion.traces : undefined; - + const traces = "traces" in completion ? completion.traces : undefined; + if (!traces || !Array.isArray(traces)) { return undefined; } @@ -1198,16 +1202,16 @@ export function getReasoningTokenCount(completion: Completion | ExperimentComple if (trace.kind !== "llm") continue; const llmTrace = trace as Extract; - + // Check if trace has usage data if (!llmTrace.usage) continue; // Handle both new detailed usage structure and old simple structure if ("completion" in llmTrace.usage && llmTrace.usage.completion) { const completionUsage = llmTrace.usage.completion; - + // Check if reasoning_token_count field exists (even if it's 0) - if ('reasoning_token_count' in completionUsage && completionUsage.reasoning_token_count !== undefined) { + if ("reasoning_token_count" in completionUsage && completionUsage.reasoning_token_count !== undefined) { hasReasoningField = true; totalReasoningTokens += completionUsage.reasoning_token_count || 0; } @@ -1221,7 +1225,7 @@ export function getReasoningTokenCount(completion: Completion | ExperimentComple /** * Checks if a completion used reasoning (has reasoning tokens > 0) - * + * * @param completion - The completion object to check * @returns True if the completion used reasoning, false if no reasoning or reasoning tokens not present */ @@ -1232,7 +1236,7 @@ export function hasReasoningTokens(completion: Completion | ExperimentCompletion /** * Gets a summary of token usage from completion traces including reasoning tokens - * + * * @param completion - The completion object containing traces * @returns Object with token usage breakdown, reasoningTokens is undefined if not present in trace */ @@ -1243,8 +1247,8 @@ export function getTokenUsageSummary(completion: Completion | ExperimentCompleti cachedTokens: number; totalTokens: number; } { - const traces = 'traces' in completion ? completion.traces : undefined; - + const traces = "traces" in completion ? completion.traces : undefined; + let promptTokens = 0; let completionTokens = 0; let reasoningTokens: number | undefined = undefined; @@ -1256,7 +1260,7 @@ export function getTokenUsageSummary(completion: Completion | ExperimentCompleti if (trace.kind !== "llm") continue; const llmTrace = trace as Extract; - + if (!llmTrace.usage) continue; // Handle detailed usage structure @@ -1274,7 +1278,7 @@ export function getTokenUsageSummary(completion: Completion | ExperimentCompleti completionTokens += usage.completion.text_token_count; } - if ('reasoning_token_count' in usage.completion && usage.completion.reasoning_token_count !== undefined) { + if ("reasoning_token_count" in usage.completion && usage.completion.reasoning_token_count !== undefined) { if (!hasReasoningField) { hasReasoningField = true; reasoningTokens = 0; // Initialize when we first find the field From 537c71034b6bcc601b84ce73ecfe32a3879f12ce Mon Sep 17 00:00:00 2001 From: Jacek Zimonski <39839016+jacekzimonski@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:31:34 +0200 Subject: [PATCH 4/4] Fix for the excluded fields --- backend/core/storage/completion_storage.py | 2 +- backend/protocol/api/_services/experiment_service.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/core/storage/completion_storage.py b/backend/core/storage/completion_storage.py index a138ee0e..8f290e64 100644 --- a/backend/core/storage/completion_storage.py +++ b/backend/core/storage/completion_storage.py @@ -6,7 +6,7 @@ from core.domain.experiment import Experiment from core.domain.version import Version -type CompletionField = Literal["traces", "agent_id"] +type CompletionField = Literal["traces", "agent_id", "input_variables", "input_messages", "output_messages", "messages"] class CompletionStorage(Protocol): diff --git a/backend/protocol/api/_services/experiment_service.py b/backend/protocol/api/_services/experiment_service.py index 2338af4a..86943df4 100644 --- a/backend/protocol/api/_services/experiment_service.py +++ b/backend/protocol/api/_services/experiment_service.py @@ -87,9 +87,9 @@ async def get_experiment( # Fetch reasoning tokens for outputs if they exist if exp.outputs: completion_ids = [output.completion_id for output in exp.outputs] - # Get completions with traces from ClickHouse (only exclude agent_id to keep traces) + # Get completions with traces from ClickHouse (exclude large fields we don't need) completions_with_traces = await self.completion_storage.completions_by_ids( - completion_ids, exclude={"agent_id"}, + completion_ids, exclude={"agent_id", "input_variables", "input_messages", "output_messages", "messages"}, ) # Create a map of completion_id -> reasoning_token_count reasoning_tokens_map = {}