diff --git a/backend/core/domain/experiment.py b/backend/core/domain/experiment.py
index ebb527e8..72c8bf8c 100644
--- a/backend/core/domain/experiment.py
+++ b/backend/core/domain/experiment.py
@@ -24,6 +24,7 @@ class ExperimentOutput(BaseModel):
     output: AgentOutput | None
     cost_usd: float | None
     duration_seconds: float | None
+    reasoning_token_count: float | None = None
 
 
 class ExperimentVersion(Version):
diff --git a/backend/core/storage/completion_storage.py b/backend/core/storage/completion_storage.py
index a138ee0e..8f290e64 100644
--- a/backend/core/storage/completion_storage.py
+++ b/backend/core/storage/completion_storage.py
@@ -6,7 +6,7 @@
 from core.domain.experiment import Experiment
 from core.domain.version import Version
 
-type CompletionField = Literal["traces", "agent_id"]
+type CompletionField = Literal["traces", "agent_id", "input_variables", "input_messages", "output_messages", "messages"]
 
 
 class CompletionStorage(Protocol):
diff --git a/backend/protocol/api/_api_models.py b/backend/protocol/api/_api_models.py
index 927b7eeb..0ce25f71 100644
--- a/backend/protocol/api/_api_models.py
+++ b/backend/protocol/api/_api_models.py
@@ -496,6 +496,10 @@ class Completion(BaseModel):
         output: Output
         cost_usd: float
         duration_seconds: float
+        reasoning_token_count: float | None = Field(
+            default=None,
+            description="The number of reasoning tokens used in the inference, if applicable.",
+        )
 
     completions: list[Completion] | None = Field(default=None, description="The completions of the experiment.")
 
diff --git a/backend/protocol/api/_services/conversions.py b/backend/protocol/api/_services/conversions.py
index f4a7499c..f39eb6fc 100644
--- a/backend/protocol/api/_services/conversions.py
+++ b/backend/protocol/api/_services/conversions.py
@@ -919,6 +919,7 @@ def experiment_completion_from_domain(completion: ExperimentOutput) -> Experimen
         output=output_from_domain(completion.output) if completion.output else Output(),
         cost_usd=completion.cost_usd or 0.0,
         duration_seconds=completion.duration_seconds or 0.0,
+        reasoning_token_count=completion.reasoning_token_count,
     )
 
 
diff --git a/backend/protocol/api/_services/experiment_service.py b/backend/protocol/api/_services/experiment_service.py
index 15b21146..86943df4 100644
--- a/backend/protocol/api/_services/experiment_service.py
+++ b/backend/protocol/api/_services/experiment_service.py
@@ -84,6 +84,25 @@ async def get_experiment(
             input_ids=input_ids,
         )
 
+        # Fetch reasoning tokens for outputs if they exist
+        if exp.outputs:
+            completion_ids = [output.completion_id for output in exp.outputs]
+            # Get completions with traces from ClickHouse (exclude large fields we don't need)
+            completions_with_traces = await self.completion_storage.completions_by_ids(
+                completion_ids, exclude={"agent_id", "input_variables", "input_messages", "output_messages", "messages"},
+            )
+            # Create a map of completion_id -> reasoning_token_count
+            reasoning_tokens_map = {}
+            for comp in completions_with_traces:
+                reasoning_tokens = self._calculate_reasoning_tokens_from_traces(comp.traces)
+                if reasoning_tokens is not None:
+                    reasoning_tokens_map[comp.id] = reasoning_tokens
+
+            # Attach reasoning token counts to experiment outputs
+            for output in exp.outputs:
+                if output.completion_id in reasoning_tokens_map:
+                    output.reasoning_token_count = reasoning_tokens_map[output.completion_id]
+
         annotations: list[Annotation] = []
         if include is None or "annotations" in include:
             annotations = await self.annotation_storage.list(
@@ -97,6 +116,41 @@ async def get_experiment(
         # getting annotations as needed
         return experiment_from_domain(exp, annotations)
 
+    def _calculate_reasoning_tokens_from_traces(self, traces: list[Any] | None) -> float | None:
+        """Sum the reasoning tokens reported by the LLM traces of a completion.
+
+        Only traces whose ``kind`` is ``"llm"`` and whose ``usage.completion`` carries a
+        non-None ``reasoning_token_count`` contribute to the total. Attributes are read
+        defensively because ``traces`` is typed as ``list[Any]``.
+
+        Args:
+            traces: Trace objects of a completion; may be None or empty.
+
+        Returns:
+            The summed count as a float (``0.0`` when every reported count is zero), or
+            None when no LLM trace reports a reasoning token count.
+        """
+        if not traces:
+            return None
+
+        total = 0.0
+        found = False
+        for trace in traces:
+            if getattr(trace, "kind", None) != "llm":
+                continue
+            # Missing or empty usage / completion usage means nothing to count for this trace.
+            usage = getattr(trace, "usage", None)
+            completion_usage = getattr(usage, "completion", None) if usage else None
+            if not completion_usage:
+                continue
+            count = getattr(completion_usage, "reasoning_token_count", None)
+            if count is not None:
+                # Track presence separately so an explicit 0 is reported as 0.0, not None.
+                found = True
+                total += float(count)
+
+        return total if found else None
+
     async def list_experiments(self, agent_id: str | None = None, limit: int = 10, offset: int = 0) -> Page[Experiment]:
         if agent_id:
             agent = await self.agent_storage.get_agent(agent_id)
diff --git a/web/eslint.config.mjs b/web/eslint.config.mjs
index 22331647..051521d9 100644
--- a/web/eslint.config.mjs
+++ b/web/eslint.config.mjs
@@ -12,6 +12,11 @@ const compat = new FlatCompat({
 const eslintConfig = [
   ...compat.extends("next/core-web-vitals", "next/typescript"),
+  {
+    // Global ignores: in flat config, `ignores` only excludes files from linting altogether when it is
+    // the sole key of its config object; next to `rules` it would merely limit where those rules apply.
+    ignores: [".next/**/*", "out/**/*", "build/**/*", "dist/**/*", "node_modules/**/*"],
+  },
   {
     rules: {
       "no-restricted-imports": [
         "error",
diff --git a/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx b/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
index 33b024d2..59efff9d 100644
--- a/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
+++ b/web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
@@ -10,6 +10,7 @@ import {
   getSharedPartsOfPrompts,
   getValidCosts,
   getValidDurations,
+  getValidReasoningTokens,
 } from "@/components/utils/utils";
 import { useColumnWidths } from "@/hooks/useColumnWidths";
 import { useVersionHiding } from "@/hooks/useVersionHiding";
@@ -115,6 +116,9 @@ export function MatrixSection(props: Props) {
     const allAvgDurations = priceAndLatencyPerVersion
       .map(({ metrics }) => metrics.avgDuration)
       .filter((duration): duration is number => duration !== undefined);
+    const allAvgReasoningTokens = priceAndLatencyPerVersion
+      .map(({ metrics }) => metrics.avgReasoningTokens)
+      .filter((tokens): tokens is number => tokens !== undefined);
 
     // Calculate raw metrics lookup for percentile data
     const rawMetricsPerVersionPerKey = getRawMetricsPerVersionPerKey(experiment, annotations);
@@ -140,6 +144,10 @@ export function MatrixSection(props: Props) {
         if (priceAndLatency.metrics.avgDuration !== undefined) {
           allMetrics.unshift({ key: "duration", average: priceAndLatency.metrics.avgDuration });
         }
+        // Only add reasoning tokens metric if it has a valid value
+        if (priceAndLatency.metrics.avgReasoningTokens !== undefined) {
+          allMetrics.unshift({ key: "reasoning", average: priceAndLatency.metrics.avgReasoningTokens });
+        }
       }
 
       // Combine allMetricsPerKey with price and latency data
@@ -151,6 +159,9 @@ export function MatrixSection(props: Props) {
         if (allAvgDurations.length > 0) {
           allMetricsPerKeyForVersion.duration = allAvgDurations;
         }
+        if (allAvgReasoningTokens.length > 0) {
+          allMetricsPerKeyForVersion.reasoning = allAvgReasoningTokens;
+        }
       }
 
       // Combine rawMetricsPerKey with price and latency data
@@ -158,6 +169,9 @@ export function MatrixSection(props: Props) {
       if (priceAndLatency?.metrics) {
         versionMetricsPerKeyForVersion.cost = priceAndLatency.metrics.costs;
         versionMetricsPerKeyForVersion.duration = priceAndLatency.metrics.durations;
+        if (priceAndLatency.metrics.reasoningTokens.length > 0) {
+          versionMetricsPerKeyForVersion.reasoning = priceAndLatency.metrics.reasoningTokens;
+        }
       }
 
       // Find the original index of this version in the sorted versions array
@@ -204,9 +218,10 @@ export function MatrixSection(props: Props) {
           .map((version) => findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id))
           .filter(Boolean); // Remove undefined completions
 
-        // Calculate cost and duration arrays for this row using centralized utility functions
+        // Calculate cost, duration, and reasoning token arrays for this row using centralized utility functions
         const allCostsForRow = getValidCosts(completionsForInput);
         const allDurationsForRow = getValidDurations(completionsForInput);
+        const allReasoningTokensForRow = getValidReasoningTokens(completionsForInput);
 
         // Calculate metrics per key for this row (for row-based comparison coloring)
         const allMetricsPerKeyForRowData = getAllMetricsPerKeyForRow(experiment, annotations, input.id);
@@ -216,13 +231,16 @@ export function MatrixSection(props: Props) {
           ...allMetricsPerKeyForRowData,
         };
 
-        // Add cost and duration arrays if they have data
+        // Add cost, duration, and reasoning token arrays if they have data
         if (allCostsForRow.length > 0) {
           allMetricsPerKeyForRow.cost = allCostsForRow;
         }
         if (allDurationsForRow.length > 0) {
           allMetricsPerKeyForRow.duration = allDurationsForRow;
         }
+        if (allReasoningTokensForRow.length > 0) {
+          allMetricsPerKeyForRow.reasoning = allReasoningTokensForRow;
+        }
 
         return orderedVersions.map((version) => {
           const completion = findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id);
diff --git a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx
index 728a5397..a9e07456 100644
--- a/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx
+++ b/web/src/app/experiments/[id]/sections/Results/completion/CompletionCell.tsx
@@ -7,7 +7,12 @@ import { PageError } from "@/components/PageError";
 import { useToast } from "@/components/ToastProvider";
 import { AnnotationsView } from "@/components/annotations/AnnotationsView";
 import { MessagesViewer } from "@/components/messages/MessagesViewer";
-import { shouldIncludeCostMetric, shouldIncludeDurationMetric } from "@/components/utils/utils";
+import {
+  getReasoningTokenCount,
+  shouldIncludeCostMetric,
+  shouldIncludeDurationMetric,
+  shouldIncludeReasoningMetric,
+} from "@/components/utils/utils";
 import { Annotation, ExperimentCompletion } from "@/types/models";
 import { getMetricsForCompletion } from "../../../utils";
 
@@ -48,6 +53,14 @@ function CompletionCell(props: CompletionCellProps) {
       metrics.push({ key: "duration", average: completion.duration_seconds });
     }
 
+    // Add reasoning tokens metric if valid using centralized utility
+    if (shouldIncludeReasoningMetric(completion)) {
+      const reasoningTokens = getReasoningTokenCount(completion);
+      if (reasoningTokens !== undefined) {
+        metrics.push({ key: "reasoning", average: reasoningTokens });
+      }
+    }
+
     // Add custom metrics from annotations
     if (completionMetrics.length > 0) {
       metrics.push(...completionMetrics);
diff --git a/web/src/components/MetricItem.tsx b/web/src/components/MetricItem.tsx
index b5f6998c..b19a3f87 100644
--- a/web/src/components/MetricItem.tsx
+++ b/web/src/components/MetricItem.tsx
@@ -71,6 +71,9 @@ export function MetricItem({
     if (metricKey.includes("duration") || metricKey.includes("latency")) {
       return "duration";
     }
+    if (metricKey.includes("reasoning")) {
+      return "reasoning";
+    }
     return undefined;
   }, [metricKey]);
 
@@ -108,18 +111,30 @@ export function MetricItem({
       return (value: number) => (usePer1kMultiplier ? formatCurrency(value, 1000) : `$${formatNumber(value)}`);
     } else if (metricType === "duration") {
       return formatDuration;
+    } else if (metricType === "reasoning") {
+      return (value: number) => `${Math.round(value).toLocaleString()} tokens`;
     } else {
       return (value: number) => value.toFixed(2);
     }
   }, [metricType, usePer1kMultiplier]);
 
   const displayLabel = showAvgPrefix
-    ? `Average ${metricKey === "cost" ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") : metricKey.replace(/_/g, " ")}`
+    ? `Average ${
+        metricKey === "cost"
+          ? usePer1kMultiplier
+            ? "cost (per 1K)"
+            : "cost"
+          : metricKey === "reasoning"
+            ? "reasoning"
+            : metricKey.replace(/_/g, " ")
+      }`
     : metricKey === "cost"
       ? usePer1kMultiplier
         ? "cost (per 1K)"
         : "cost"
-      : metricKey.replace(/_/g, " ");
+      : metricKey === "reasoning"
+        ? "reasoning"
+        : metricKey.replace(/_/g, " ");
 
   if (percentiles && showAvgPrefix) {
     return (
diff --git a/web/src/components/completion-modal/TracesView.tsx b/web/src/components/completion-modal/TracesView.tsx
index 3b7bbeba..b74e1895 100644
--- a/web/src/components/completion-modal/TracesView.tsx
+++ b/web/src/components/completion-modal/TracesView.tsx
@@ -7,21 +7,79 @@ type Props = {
   traces?: Trace[];
 };
 
+type UsageInfoProps = {
+  trace: Extract<Trace, { kind: "llm" }>;
+  traceIndex: number;
+};
+
+/**
+ * Renders the token-usage rows of a single LLM trace.
+ * Handles the detailed usage shape (`prompt` / `completion` objects) and falls back to the simple
+ * `input_tokens` / `output_tokens` / `total_tokens` shape. Rows whose count is missing or not positive are omitted.
+ */
+function UsageInfo({ trace, traceIndex }: UsageInfoProps) {
+  if (!trace.usage) return null;
+
+  // [key suffix, title, count] for every row that may be shown, in display order
+  let rows: [string, string, number | undefined][];
+  let totalCost = 0;
+
+  if ("prompt" in trace.usage && "completion" in trace.usage) {
+    // Detailed usage structure. The `in` check only proves the keys exist; presumably either part can
+    // still be null/undefined at runtime, so both are read with optional chaining (TODO confirm).
+    const { prompt, completion } = trace.usage as {
+      prompt?: { text_token_count?: number; cost_usd?: number } | null;
+      completion?: {
+        text_token_count?: number;
+        reasoning_token_count?: number;
+        cached_token_count?: number;
+        cost_usd?: number;
+      } | null;
+    };
+    rows = [
+      ["prompt-tokens", "Prompt Tokens", prompt?.text_token_count],
+      ["completion-tokens", "Completion Tokens", completion?.text_token_count],
+      ["reasoning-tokens", "Reasoning Tokens", completion?.reasoning_token_count],
+      ["cached-tokens", "Cached Tokens", completion?.cached_token_count],
+    ];
+    totalCost = (prompt?.cost_usd ?? 0) + (completion?.cost_usd ?? 0);
+  } else {
+    // Old simple usage structure: token counts only, no cost fields are read
+    const simpleUsage = trace.usage as { input_tokens?: number; output_tokens?: number; total_tokens?: number };
+    rows = [
+      ["input-tokens", "Input Tokens", simpleUsage.input_tokens],
+      ["output-tokens", "Output Tokens", simpleUsage.output_tokens],
+      ["total-tokens", "Total Tokens", simpleUsage.total_tokens],
+    ];
+  }
+
+  return (
+    <>
+      {rows.map(([suffix, title, count]) =>
+        count && count > 0 ? (
+          <InfoRow key={`${traceIndex}-${suffix}`} title={title} value={count.toLocaleString()} />
+        ) : null
+      )}
+      {totalCost > 0 && <InfoRow title="Total Cost" value={`$${formatNumber(totalCost)}`} />}
+    </>
+  );
+}
+
 export function TracesView({ traces }: Props) {
   const llmTracesWithUsage = useMemo(() => {
     if (!traces || traces.length === 0) {
       return [];
     }
 
-    // Filter for LLM traces that have usage data with text_token_count
+    // Filter for LLM traces that have usage data
     return traces.filter((trace): trace is Extract<Trace, { kind: "llm" }> => {
       if (trace.kind !== "llm" || !trace.usage) return false;
 
-      // Check if any usage entry has text_token_count
-      return Object.values(trace.usage).some(
-        (usageValue: unknown) =>
-          typeof usageValue === "object" && usageValue !== null && "text_token_count" in usageValue
-      );
+      // Check if usage has the new detailed structure or old simple structure
+      const hasDetailedUsage = "prompt" in trace.usage && "completion" in trace.usage;
+      const hasSimpleUsage = "input_tokens" in trace.usage || "output_tokens" in trace.usage;
+
+      return hasDetailedUsage || hasSimpleUsage;
     });
   }, [traces]);
 
@@ -37,43 +149,7 @@ export function TracesView({ traces }: Props) {
           <div key={traceIndex} className="space-y-2">
             {/* Show provider if available */}
             {trace.provider && <InfoRow key={`${traceIndex}-provider`} title="Provider" value={trace.provider} />}
-            {trace.usage &&
-              Object.entries(trace.usage).map(([key, usageValue]) => {
-                // Type guard to check if usageValue has the expected properties
-                if (typeof usageValue !== "object" || usageValue === null || !("text_token_count" in usageValue)) {
-                  return null;
-                }
-
-                const usageData = usageValue as {
-                  text_token_count: number;
-                  cost_usd?: number;
-                };
-
-                const textTokenCount = usageData.text_token_count;
-                const costUsd = usageData.cost_usd;
-
-                if (textTokenCount === undefined) return null;
-
-                // Format the title based on the key
-                const formatTitle = (key: string) => {
-                  const capitalizedKey = key.charAt(0).toUpperCase() + key.slice(1);
-                  return `${capitalizedKey} Token Count`;
-                };
-
-                const formatCostTitle = (key: string) => {
-                  const capitalizedKey = key.charAt(0).toUpperCase() + key.slice(1);
-                  return `${capitalizedKey} Cost`;
-                };
-
-                return (
-                  <div key={`${traceIndex}-${key}`} className="space-y-2">
-                    <InfoRow title={formatTitle(key)} value={`${textTokenCount.toLocaleString()}`} />
-                    {costUsd !== undefined && (
-                      <InfoRow title={formatCostTitle(key)} value={`$${formatNumber(costUsd)}`} />
-                    )}
-                  </div>
-                );
-              })}
+            <UsageInfo trace={trace} traceIndex={traceIndex} />
           </div>
         ))}
       </div>
diff --git a/web/src/components/utils/__tests__/utils.test.ts b/web/src/components/utils/__tests__/utils.test.ts
index e8b0ebfd..ad6e8d71 100644
--- a/web/src/components/utils/__tests__/utils.test.ts
+++ b/web/src/components/utils/__tests__/utils.test.ts
@@ -1,4 +1,4 @@
-import { Version } from "@/types/models";
+import { Completion, ExperimentCompletion, Version } from "@/types/models";
 import {
   calculateAverageMetrics,
   filterAnnotations,
@@ -9,6 +9,7 @@ import {
   getDifferingVersionKeys,
   getMetricBadgeColor,
   getMetricBadgeWithRelative,
+  getReasoningTokenCount,
   getSharedPartsOfPrompts,
   getValidCosts,
   getValidDurations,
@@ -19,6 +20,7 @@ import {
   resolveRef,
   shouldIncludeCostMetric,
   shouldIncludeDurationMetric,
+  shouldIncludeReasoningMetric,
   sortVersionKeys,
   stripMarkdown,
   transformCompletionsData,
@@ -306,8 +308,10 @@ describe("Calculation Functions", () => {
       expect(result).toEqual({
         avgCost: undefined,
         avgDuration: undefined,
+        avgReasoningTokens: undefined,
         costs: [],
         durations: [],
+        reasoningTokens: [],
       });
     });
 
@@ -318,8 +322,10 @@ describe("Calculation Functions", () => {
       expect(result).toEqual({
         avgCost: 2,
         avgDuration: 3,
+        avgReasoningTokens: undefined,
         costs: [1, 3],
         durations: [2, 4],
+        reasoningTokens: [],
       });
     });
 
@@ -333,6 +339,94 @@ describe("Calculation Functions", () => {
       expect(result.avgCost).toBe(1); // (0 + 2) / 2
       expect(result.avgDuration).toBe(2); // (0 + 4) / 2
     });
+
+    it("calculates reasoning tokens correctly", () => {
+      const completions = [
+        {
+          ...mockExperimentCompletion(1, 2),
+          reasoning_token_count: 100,
+        },
+        {
+          ...mockExperimentCompletion(2, 3),
+          reasoning_token_count: 200,
+        },
+      ];
+
+      const result = calculateAverageMetrics(completions);
+      expect(result.avgReasoningTokens).toBe(150); // (100 + 200) / 2
+      expect(result.reasoningTokens).toEqual([100, 200]);
+    });
+  });
+
+  describe("getReasoningTokenCount", () => {
+    it("returns reasoning tokens from ExperimentCompletion field", () => {
+      const completion: Partial<ExperimentCompletion> = {
+        reasoning_token_count: 150,
+      };
+      expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(150);
+    });
+
+    it("returns 0 for explicitly 0 reasoning tokens", () => {
+      const completion: Partial<ExperimentCompletion> = {
+        reasoning_token_count: 0,
+      };
+      expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBe(0);
+    });
+
+    it("returns undefined when reasoning tokens field not present", () => {
+      const completion: Partial<ExperimentCompletion> = {};
+      expect(getReasoningTokenCount(completion as ExperimentCompletion)).toBeUndefined();
+    });
+
+    it("falls back to traces for regular Completion type", () => {
+      const completion: Partial<Completion> = {
+        traces: [
+          {
+            kind: "llm",
+            duration_seconds: 1,
+            cost_usd: 0,
+            model: "test-model",
+            provider: "test-provider",
+            usage: {
+              prompt: { text_token_count: 50, cost_usd: 0 },
+              completion: { text_token_count: 100, reasoning_token_count: 100, cost_usd: 0 },
+            },
+          },
+        ],
+      };
+      expect(getReasoningTokenCount(completion as Completion)).toBe(100);
+    });
+  });
+
+  describe("shouldIncludeReasoningMetric", () => {
+    it("returns true for valid completion with reasoning tokens", () => {
+      const completion: Partial<ExperimentCompletion> = {
+        reasoning_token_count: 150,
+        output: { messages: [] },
+      };
+      expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(true);
+    });
+
+    it("returns false for undefined completion", () => {
+      expect(shouldIncludeReasoningMetric(undefined)).toBe(false);
+    });
+
+    it("returns false when output has error", () => {
+      const completion: Partial<ExperimentCompletion> = {
+        reasoning_token_count: 150,
+        output: {
+          error: { error: "Something went wrong" },
+        },
+      };
+      expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(false);
+    });
+
+    it("returns false when reasoning tokens are undefined", () => {
+      const completion: Partial<ExperimentCompletion> = {
+        output: { messages: [] },
+      };
+      expect(shouldIncludeReasoningMetric(completion as ExperimentCompletion)).toBe(false);
+    });
   });
 });
 
diff --git a/web/src/components/utils/utils.ts b/web/src/components/utils/utils.ts
index a902c6c9..4bc00a40 100644
--- a/web/src/components/utils/utils.ts
+++ b/web/src/components/utils/utils.ts
@@ -1,11 +1,13 @@
 // Utility functions for experiment components
 import {
   Annotation,
+  Completion,
   ExperimentCompletion,
   ExperimentWithLookups,
   ExtendedVersion,
   Message,
   OutputSchema,
+  Trace,
   Version,
 } from "@/types/models";
 import { findCommonSubstrings } from "./stringMatchingUtils";
@@ -32,7 +34,7 @@ export function getMetricBadgeWithRelative(
   value: number,
   values: number[],
   isHigherBetter: boolean = false,
-  metricType?: "cost" | "duration"
+  metricType?: "cost" | "duration" | "reasoning"
 ) {
   if (!values || values.length === 0) {
     return {
@@ -70,6 +72,9 @@ export function getMetricBadgeWithRelative(
     if (metricType === "duration") {
       return isBetterValue ? "faster" : "slower";
     }
+    if (metricType === "reasoning") {
+      return isBetterValue ? "more efficient" : "less efficient";
+    }
     return ""; // Don't show any descriptor for unknown metric types
   };
 
@@ -79,14 +84,14 @@ export function getMetricBadgeWithRelative(
 
     if (isBest) {
       color =
-        metricType === "cost" || metricType === "duration"
+        metricType === "cost" || metricType === "duration" || metricType === "reasoning"
           ? "bg-green-200 border border-green-400 text-green-900"
           : "bg-transparent border border-gray-200 text-gray-700";
       const comparisonText = getComparisonText(true);
       relativeText = comparisonText ? `${(max / min).toFixed(1)}x ${comparisonText}` : `${(max / min).toFixed(1)}x`;
     } else if (isWorst) {
       color =
-        metricType === "cost" || metricType === "duration"
+        metricType === "cost" || metricType === "duration" || metricType === "reasoning"
           ? "bg-red-200 border border-red-300 text-red-900"
           : "bg-transparent border border-gray-200 text-gray-700";
       const comparisonText = getComparisonText(false);
@@ -97,7 +102,7 @@ export function getMetricBadgeWithRelative(
 
     // For non-best values, show how much worse they are
     if (!isBest && max > 0) {
-      if (metricType === "cost" || metricType === "duration") {
+      if (metricType === "cost" || metricType === "duration" || metricType === "reasoning") {
         relativeText = `${(max / value).toFixed(1)}x ${getComparisonText(false)}`;
       } else {
         relativeText = `${(max / value).toFixed(1)}x`;
@@ -109,14 +114,14 @@ export function getMetricBadgeWithRelative(
 
     if (isBest) {
       color =
-        metricType === "cost" || metricType === "duration"
+        metricType === "cost" || metricType === "duration" || metricType === "reasoning"
           ? "bg-green-200 border border-green-400 text-green-900"
           : "bg-transparent border border-gray-200 text-gray-700";
       const comparisonText = getComparisonText(true);
       relativeText = comparisonText ? `${(max / min).toFixed(1)}x ${comparisonText}` : `${(max / min).toFixed(1)}x`;
     } else if (isWorst) {
       color =
-        metricType === "cost" || metricType === "duration"
+        metricType === "cost" || metricType === "duration" || metricType === "reasoning"
           ? "bg-red-200 border border-red-300 text-red-900"
           : "bg-transparent border border-gray-200 text-gray-700";
       const comparisonText = getComparisonText(false);
@@ -127,7 +132,7 @@ export function getMetricBadgeWithRelative(
 
     // For non-best values, show how much worse they are
     if (!isBest && min > 0) {
-      if (metricType === "cost" || metricType === "duration") {
+      if (metricType === "cost" || metricType === "duration" || metricType === "reasoning") {
         relativeText = `${(value / min).toFixed(1)}x ${getComparisonText(false)}`;
       } else {
         relativeText = `${(value / min).toFixed(1)}x`;
@@ -256,6 +261,18 @@ export function shouldIncludeDurationMetric(
   );
 }
 
+/**
+ * Tells whether a completion should contribute a reasoning-token metric.
+ * A completion qualifies when it is defined, its output has no error, and it has a reasoning token count.
+ * The count is compared with `!= null` so that a `null` value (e.g. an unset optional field serialized
+ * as JSON null) is treated like a missing one instead of being reported as a metric.
+ */
+export function shouldIncludeReasoningMetric(
+  completion: ExperimentCompletion | undefined
+): completion is ExperimentCompletion {
+  return completion != null && !completion.output?.error && getReasoningTokenCount(completion) != null;
+}
+
 export function getValidCosts(completions: (ExperimentCompletion | undefined)[]): number[] {
   return completions
     .filter((completion): completion is ExperimentCompletion => shouldIncludeCostMetric(completion))
@@ -268,26 +279,52 @@ export function getValidDurations(completions: (ExperimentCompletion | undefined
     .map((completion) => completion.duration_seconds);
 }
 
+/**
+ * Collects the reasoning token counts of the completions that qualify for the reasoning metric
+ * (as decided by `shouldIncludeReasoningMetric`), in input order.
+ * Counts that are `null` or `undefined` are dropped so the result really is a `number[]`.
+ */
+export function getValidReasoningTokens(completions: (ExperimentCompletion | undefined)[]): number[] {
+  return completions
+    .filter((completion): completion is ExperimentCompletion => shouldIncludeReasoningMetric(completion))
+    .map((completion) => getReasoningTokenCount(completion))
+    .filter((tokens): tokens is number => tokens != null);
+}
+
 export function calculateAverageMetrics(completions: ExperimentCompletion[]): {
   avgCost: number | undefined;
   avgDuration: number | undefined;
+  avgReasoningTokens: number | undefined;
   costs: number[];
   durations: number[];
+  reasoningTokens: number[];
 } {
-  if (completions.length === 0) return { avgCost: undefined, avgDuration: undefined, costs: [], durations: [] };
+  if (completions.length === 0)
+    return {
+      avgCost: undefined,
+      avgDuration: undefined,
+      avgReasoningTokens: undefined,
+      costs: [],
+      durations: [],
+      reasoningTokens: [],
+    };
 
   // Use centralized filtering logic
   const costs = getValidCosts(completions);
   const durations = getValidDurations(completions);
+  const reasoningTokens = getValidReasoningTokens(completions);
 
   const totalCost = costs.reduce((sum, cost) => sum + cost, 0);
   const totalDuration = durations.reduce((sum, duration) => sum + duration, 0);
+  const totalReasoningTokens = reasoningTokens.reduce((sum, tokens) => sum + tokens, 0);
 
   return {
     avgCost: costs.length > 0 ? totalCost / costs.length : undefined,
     avgDuration: durations.length > 0 ? totalDuration / durations.length : undefined,
+    avgReasoningTokens: reasoningTokens.length > 0 ? totalReasoningTokens / reasoningTokens.length : undefined,
     costs,
     durations,
+    reasoningTokens,
   };
 }
 
@@ -325,7 +357,14 @@ export function getPriceAndLatencyPerVersion(
   }>
 ): Array<{
   versionId: string;
-  metrics: { avgCost: number | undefined; avgDuration: number | undefined; costs: number[]; durations: number[] };
+  metrics: {
+    avgCost: number | undefined;
+    avgDuration: number | undefined;
+    avgReasoningTokens: number | undefined;
+    costs: number[];
+    durations: number[];
+    reasoningTokens: number[];
+  };
 }> {
   return completionsPerVersion.map(({ versionId, completions }) => ({
     versionId,
@@ -1134,3 +1173,131 @@ export function stripMarkdown(markdown: string): string {
     .replace(/\n+/g, " ") // Replace newlines with spaces
     .trim();
 }
+
+/**
+ * Extracts the reasoning token count of a completion: read directly from an ExperimentCompletion,
+ * summed over the LLM traces of a regular Completion. A null count is handled like a missing one.
+ *
+ * @param completion - The completion or experiment completion to inspect
+ * @returns The total number of reasoning tokens used (0 included), or undefined if no count is reported
+ */
+export function getReasoningTokenCount(completion: Completion | ExperimentCompletion): number | undefined {
+  // For ExperimentCompletion, use the direct field; `?? undefined` maps a null payload value to
+  // undefined so callers that only filter out undefined do not average it in as a 0
+  if ("reasoning_token_count" in completion) {
+    return completion.reasoning_token_count ?? undefined;
+  }
+
+  // For regular Completion, fall back to parsing traces
+  const traces = "traces" in completion ? completion.traces : undefined;
+
+  if (!traces || !Array.isArray(traces)) {
+    return undefined;
+  }
+
+  let totalReasoningTokens = 0;
+  let hasReasoningField = false;
+
+  for (const trace of traces) {
+    // Only check LLM traces
+    if (trace.kind !== "llm") continue;
+
+    const llmTrace = trace as Extract<Trace, { kind: "llm" }>;
+
+    // Check if trace has usage data
+    if (!llmTrace.usage) continue;
+
+    // Only the detailed usage structure (with a non-empty `completion` part) can carry the count
+    if ("completion" in llmTrace.usage && llmTrace.usage.completion) {
+      const completionUsage = llmTrace.usage.completion;
+
+      // Only a numeric count marks the field as reported (0 included, null / undefined excluded)
+      if ("reasoning_token_count" in completionUsage && typeof completionUsage.reasoning_token_count === "number") {
+        hasReasoningField = true;
+        totalReasoningTokens += completionUsage.reasoning_token_count;
+      }
+    }
+  }
+
+  // undefined when no trace reported a count, otherwise the total (which may be 0)
+  return hasReasoningField ? totalReasoningTokens : undefined;
+}
+
+/**
+ * Tells whether a completion actually consumed reasoning tokens.
+ *
+ * @param completion - The completion object to inspect
+ * @returns True only when a reasoning token count is available and greater than 0
+ */
+export function hasReasoningTokens(completion: Completion | ExperimentCompletion): boolean {
+  // A missing count is treated as zero, so "unknown" and "none used" both yield false
+  return (getReasoningTokenCount(completion) ?? 0) > 0;
+}
+
+/**
+ * Gets a summary of token usage from completion traces including reasoning tokens.
+ * Only traces of kind "llm" whose usage has both `prompt` and `completion` keys are counted;
+ * `prompt` / `completion` values that are null or missing are skipped instead of throwing.
+ * A completion without a `traces` array yields zero totals and undefined reasoningTokens.
+ *
+ * @param completion - The completion object containing traces
+ * @returns Object with token usage breakdown; reasoningTokens is undefined unless at least one trace
+ *   reports a numeric reasoning_token_count (0 included)
+ */
+export function getTokenUsageSummary(completion: Completion | ExperimentCompletion): {
+  promptTokens: number;
+  completionTokens: number;
+  reasoningTokens: number | undefined;
+  cachedTokens: number;
+  totalTokens: number;
+} {
+  // Not every accepted input has a `traces` key, hence the `in` check
+  const traces = "traces" in completion ? completion.traces : undefined;
+
+  let promptTokens = 0;
+  let completionTokens = 0;
+  // Stays undefined until a trace reports a numeric reasoning_token_count
+  let reasoningTokens: number | undefined = undefined;
+  let cachedTokens = 0;
+
+  if (traces && Array.isArray(traces)) {
+    for (const trace of traces) {
+      if (trace.kind !== "llm") continue;
+
+      const llmTrace = trace as Extract<Trace, { kind: "llm" }>;
+
+      if (!llmTrace.usage) continue;
+
+      // Handle detailed usage structure
+      if ("prompt" in llmTrace.usage && "completion" in llmTrace.usage) {
+        // The keys may exist with a null value in the payload, so both parts are typed as optional
+        // and only read through optional chaining (a plain property access would throw)
+        const usage = llmTrace.usage as {
+          prompt?: { text_token_count?: number };
+          completion?: { text_token_count?: number; reasoning_token_count?: number; cached_token_count?: number };
+        };
+
+        // Missing, null and 0 counts all contribute nothing to these totals
+        promptTokens += usage.prompt?.text_token_count || 0;
+        completionTokens += usage.completion?.text_token_count || 0;
+        cachedTokens += usage.completion?.cached_token_count || 0;
+
+        // Reasoning tokens differ: a reported 0 must still turn the result from undefined into a
+        // number, so presence is decided by type instead of truthiness
+        const reasoning = usage.completion?.reasoning_token_count;
+        if (typeof reasoning === "number") {
+          reasoningTokens = (reasoningTokens ?? 0) + reasoning;
+        }
+      }
+    }
+  }
+
+  return {
+    promptTokens,
+    completionTokens,
+    reasoningTokens,
+    cachedTokens,
+    // Unknown reasoning usage counts as 0 in the total
+    totalTokens: promptTokens + completionTokens + (reasoningTokens ?? 0),
+  };
+}
diff --git a/web/src/types/models.ts b/web/src/types/models.ts
index cbd6698c..c1bfcdb8 100644
--- a/web/src/types/models.ts
+++ b/web/src/types/models.ts
@@ -145,10 +145,23 @@ export interface Output {
   error?: Error;
 }
 
+export interface TokenUsage {
+  text_token_count?: number; // every counter in this interface is optional; only cost_usd is required
+  audio_token_count?: number;
+  audio_count?: number;
+  image_token_count?: number;
+  image_count?: number;
+  cost_usd: number; // NOTE(review): presumably the cost in USD of this usage part - confirm against the API
+}
+
+export interface CompletionUsage extends TokenUsage {
+  cached_token_count?: number; // optional; NOTE(review): presumably absent when none is reported - confirm
+  reasoning_token_count?: number; // optional; a missing value is not the same as an explicit 0
+}
+
 export interface InferenceUsage {
-  input_tokens: number;
-  output_tokens: number;
-  total_tokens: number;
+  prompt: TokenUsage; // usage of the prompt side
+  completion: CompletionUsage; // usage of the completion side, including cached / reasoning token counts
 }
 
 export interface LLMTrace {
@@ -199,6 +212,7 @@ export interface ExperimentCompletion {
   output: Output;
   cost_usd: number;
   duration_seconds: number;
+  reasoning_token_count?: number;
 }
 
 export interface Completion {