1 change: 1 addition & 0 deletions backend/core/domain/experiment.py
@@ -24,6 +24,7 @@ class ExperimentOutput(BaseModel):
output: AgentOutput | None
cost_usd: float | None
duration_seconds: float | None
reasoning_token_count: float | None = None


class ExperimentVersion(Version):
2 changes: 1 addition & 1 deletion backend/core/storage/completion_storage.py
@@ -6,7 +6,7 @@
from core.domain.experiment import Experiment
from core.domain.version import Version

type CompletionField = Literal["traces", "agent_id"]
type CompletionField = Literal["traces", "agent_id", "input_variables", "input_messages", "output_messages", "messages"]


class CompletionStorage(Protocol):
4 changes: 4 additions & 0 deletions backend/protocol/api/_api_models.py
@@ -496,6 +496,10 @@ class Completion(BaseModel):
output: Output
cost_usd: float
duration_seconds: float
reasoning_token_count: float | None = Field(
default=None,
description="The number of reasoning tokens used in the inference, if applicable.",
)

completions: list[Completion] | None = Field(default=None, description="The completions of the experiment.")

1 change: 1 addition & 0 deletions backend/protocol/api/_services/conversions.py
@@ -919,6 +919,7 @@ def experiment_completion_from_domain(completion: ExperimentOutput) -> Experimen
output=output_from_domain(completion.output) if completion.output else Output(),
cost_usd=completion.cost_usd or 0.0,
duration_seconds=completion.duration_seconds or 0.0,
reasoning_token_count=completion.reasoning_token_count,
)


54 changes: 54 additions & 0 deletions backend/protocol/api/_services/experiment_service.py
@@ -84,6 +84,25 @@ async def get_experiment(
input_ids=input_ids,
)

# Fetch reasoning tokens for outputs if they exist
if exp.outputs:
completion_ids = [output.completion_id for output in exp.outputs]
# Get completions with traces from ClickHouse (exclude large fields we don't need)
completions_with_traces = await self.completion_storage.completions_by_ids(
completion_ids, exclude={"agent_id", "input_variables", "input_messages", "output_messages", "messages"},
)
# Create a map of completion_id -> reasoning_token_count
reasoning_tokens_map = {}
for comp in completions_with_traces:
reasoning_tokens = self._calculate_reasoning_tokens_from_traces(comp.traces)
if reasoning_tokens is not None:
reasoning_tokens_map[comp.id] = reasoning_tokens

# Attach reasoning token counts to experiment outputs
for output in exp.outputs:
if output.completion_id in reasoning_tokens_map:
output.reasoning_token_count = reasoning_tokens_map[output.completion_id]

annotations: list[Annotation] = []
if include is None or "annotations" in include:
annotations = await self.annotation_storage.list(
@@ -97,6 +116,41 @@ async def get_experiment(
# getting annotations as needed
return experiment_from_domain(exp, annotations)

def _calculate_reasoning_tokens_from_traces(self, traces: list[Any] | None) -> float | None:
"""Calculate total reasoning tokens from traces.

Args:
traces: List of trace objects

Returns:
Total reasoning tokens as float, or None if no reasoning tokens found
"""
if not traces:
return None

total_reasoning_tokens = 0.0
has_reasoning_field = False

for trace in traces:
# Only check LLM traces
if not hasattr(trace, "kind") or trace.kind != "llm":
continue

# Check if trace has usage data
if not hasattr(trace, "usage") or not trace.usage:
continue

# Handle detailed usage structure
if hasattr(trace.usage, "completion") and trace.usage.completion:
completion_usage = trace.usage.completion

# Check if reasoning_token_count field exists
if hasattr(completion_usage, "reasoning_token_count") and completion_usage.reasoning_token_count is not None:
has_reasoning_field = True
total_reasoning_tokens += float(completion_usage.reasoning_token_count or 0)

return total_reasoning_tokens if has_reasoning_field else None

async def list_experiments(self, agent_id: str | None = None, limit: int = 10, offset: int = 0) -> Page[Experiment]:
if agent_id:
agent = await self.agent_storage.get_agent(agent_id)
10 changes: 10 additions & 0 deletions web/eslint.config.mjs
@@ -12,6 +12,16 @@ const compat = new FlatCompat({
const eslintConfig = [
...compat.extends("next/core-web-vitals", "next/typescript"),
{
ignores: [
".next/**/*",
"out/**/*",
"build/**/*",
"dist/**/*",
"node_modules/**/*",
".next/types/**/*",
".next/static/**/*",
".next/server/**/*",
],
rules: {
"no-restricted-imports": [
"error",
22 changes: 20 additions & 2 deletions web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
@@ -10,6 +10,7 @@ import {
getSharedPartsOfPrompts,
getValidCosts,
getValidDurations,
getValidReasoningTokens,
} from "@/components/utils/utils";
import { useColumnWidths } from "@/hooks/useColumnWidths";
import { useVersionHiding } from "@/hooks/useVersionHiding";
@@ -115,6 +116,9 @@ export function MatrixSection(props: Props) {
const allAvgDurations = priceAndLatencyPerVersion
.map(({ metrics }) => metrics.avgDuration)
.filter((duration): duration is number => duration !== undefined);
const allAvgReasoningTokens = priceAndLatencyPerVersion
.map(({ metrics }) => metrics.avgReasoningTokens)
.filter((tokens): tokens is number => tokens !== undefined);

// Calculate raw metrics lookup for percentile data
const rawMetricsPerVersionPerKey = getRawMetricsPerVersionPerKey(experiment, annotations);
@@ -140,6 +144,10 @@ export function MatrixSection(props: Props) {
if (priceAndLatency.metrics.avgDuration !== undefined) {
allMetrics.unshift({ key: "duration", average: priceAndLatency.metrics.avgDuration });
}
// Only add reasoning tokens metric if it has a valid value
if (priceAndLatency.metrics.avgReasoningTokens !== undefined) {
allMetrics.unshift({ key: "reasoning", average: priceAndLatency.metrics.avgReasoningTokens });
}
}

// Combine allMetricsPerKey with price and latency data
@@ -151,13 +159,19 @@ export function MatrixSection(props: Props) {
if (allAvgDurations.length > 0) {
allMetricsPerKeyForVersion.duration = allAvgDurations;
}
if (allAvgReasoningTokens.length > 0) {
allMetricsPerKeyForVersion.reasoning = allAvgReasoningTokens;
}
}

// Combine rawMetricsPerKey with price and latency data
const versionMetricsPerKeyForVersion = { ...rawMetricsForVersion };
if (priceAndLatency?.metrics) {
versionMetricsPerKeyForVersion.cost = priceAndLatency.metrics.costs;
versionMetricsPerKeyForVersion.duration = priceAndLatency.metrics.durations;
if (priceAndLatency.metrics.reasoningTokens.length > 0) {
versionMetricsPerKeyForVersion.reasoning = priceAndLatency.metrics.reasoningTokens;
}
}

// Find the original index of this version in the sorted versions array
@@ -204,9 +218,10 @@ export function MatrixSection(props: Props) {
.map((version) => findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id))
.filter(Boolean); // Remove undefined completions

// Calculate cost and duration arrays for this row using centralized utility functions
// Calculate cost, duration, and reasoning token arrays for this row using centralized utility functions
const allCostsForRow = getValidCosts(completionsForInput);
const allDurationsForRow = getValidDurations(completionsForInput);
const allReasoningTokensForRow = getValidReasoningTokens(completionsForInput);

// Calculate metrics per key for this row (for row-based comparison coloring)
const allMetricsPerKeyForRowData = getAllMetricsPerKeyForRow(experiment, annotations, input.id);
@@ -216,13 +231,16 @@ export function MatrixSection(props: Props) {
...allMetricsPerKeyForRowData,
};

// Add cost and duration arrays if they have data
// Add cost, duration, and reasoning token arrays if they have data
if (allCostsForRow.length > 0) {
allMetricsPerKeyForRow.cost = allCostsForRow;
}
if (allDurationsForRow.length > 0) {
allMetricsPerKeyForRow.duration = allDurationsForRow;
}
if (allReasoningTokensForRow.length > 0) {
allMetricsPerKeyForRow.reasoning = allReasoningTokensForRow;
}

return orderedVersions.map((version) => {
const completion = findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id);
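Note: getValidReasoningTokens is imported from "@/components/utils/utils", but that file's diff is not included in this excerpt. A minimal sketch of what the helper presumably does, assuming the generated ExperimentCompletion type exposes the new reasoning_token_count field from _api_models.py and mirroring getValidCosts / getValidDurations:

// Hypothetical sketch, not part of this diff; the reasoning_token_count field name is
// assumed to match the API model added in backend/protocol/api/_api_models.py.
import { ExperimentCompletion } from "@/types/models";

export function getValidReasoningTokens(
  completions: (ExperimentCompletion | undefined)[]
): number[] {
  // Keep only completions that actually reported a reasoning token count, so callers
  // can use an empty array to decide whether to show the metric at all.
  return completions
    .map((completion) => completion?.reasoning_token_count)
    .filter((tokens): tokens is number => tokens !== undefined && tokens !== null);
}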
@@ -7,7 +7,12 @@ import { PageError } from "@/components/PageError";
import { useToast } from "@/components/ToastProvider";
import { AnnotationsView } from "@/components/annotations/AnnotationsView";
import { MessagesViewer } from "@/components/messages/MessagesViewer";
import { shouldIncludeCostMetric, shouldIncludeDurationMetric } from "@/components/utils/utils";
import {
getReasoningTokenCount,
shouldIncludeCostMetric,
shouldIncludeDurationMetric,
shouldIncludeReasoningMetric,
} from "@/components/utils/utils";
import { Annotation, ExperimentCompletion } from "@/types/models";
import { getMetricsForCompletion } from "../../../utils";

@@ -48,6 +53,14 @@ function CompletionCell(props: CompletionCellProps) {
metrics.push({ key: "duration", average: completion.duration_seconds });
}

// Add reasoning tokens metric if valid using centralized utility
if (shouldIncludeReasoningMetric(completion)) {
const reasoningTokens = getReasoningTokenCount(completion);
if (reasoningTokens !== undefined) {
metrics.push({ key: "reasoning", average: reasoningTokens });
}
}

// Add custom metrics from annotations
if (completionMetrics.length > 0) {
metrics.push(...completionMetrics);
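Note: getReasoningTokenCount and shouldIncludeReasoningMetric also come from "@/components/utils/utils" and are not shown in this excerpt. Rough sketches under the same assumption about the reasoning_token_count field; the positive-value check mirrors how shouldIncludeCostMetric / shouldIncludeDurationMetric are used above:

// Hypothetical sketches; the actual helpers live in web/src/components/utils/utils.ts.
import { ExperimentCompletion } from "@/types/models";

// Returns the completion's reasoning token count if the backend reported one.
export function getReasoningTokenCount(completion: ExperimentCompletion): number | undefined {
  return completion.reasoning_token_count ?? undefined;
}

// Only surface the reasoning metric when a positive token count is present.
export function shouldIncludeReasoningMetric(completion: ExperimentCompletion): boolean {
  const tokens = getReasoningTokenCount(completion);
  return tokens !== undefined && tokens > 0;
}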
19 changes: 17 additions & 2 deletions web/src/components/MetricItem.tsx
@@ -71,6 +71,9 @@ export function MetricItem({
if (metricKey.includes("duration") || metricKey.includes("latency")) {
return "duration";
}
if (metricKey.includes("reasoning")) {
return "reasoning";
}
return undefined;
}, [metricKey]);

@@ -108,18 +111,30 @@ export function MetricItem({
return (value: number) => (usePer1kMultiplier ? formatCurrency(value, 1000) : `$${formatNumber(value)}`);
} else if (metricType === "duration") {
return formatDuration;
} else if (metricType === "reasoning") {
return (value: number) => `${Math.round(value).toLocaleString()} tokens`;
} else {
return (value: number) => value.toFixed(2);
}
}, [metricType, usePer1kMultiplier]);

const displayLabel = showAvgPrefix
? `Average ${metricKey === "cost" ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") : metricKey.replace(/_/g, " ")}`
? `Average ${
metricKey === "cost"
? usePer1kMultiplier
? "cost (per 1K)"
: "cost"
: metricKey === "reasoning"
? "reasoning"
: metricKey.replace(/_/g, " ")
}`
: metricKey === "cost"
? usePer1kMultiplier
? "cost (per 1K)"
: "cost"
: metricKey.replace(/_/g, " ");
: metricKey === "reasoning"
? "reasoning"
: metricKey.replace(/_/g, " ");

if (percentiles && showAvgPrefix) {
return (