1 change: 1 addition & 0 deletions backend/core/domain/experiment.py
@@ -24,6 +24,7 @@ class ExperimentOutput(BaseModel):
output: AgentOutput | None
cost_usd: float | None
duration_seconds: float | None
reasoning_token_count: float | None = None


class ExperimentVersion(Version):
2 changes: 1 addition & 1 deletion backend/core/storage/completion_storage.py
@@ -6,7 +6,7 @@
from core.domain.experiment import Experiment
from core.domain.version import Version

type CompletionField = Literal["traces", "agent_id"]
type CompletionField = Literal["traces", "agent_id", "input_variables", "input_messages", "output_messages", "messages"]


class CompletionStorage(Protocol):
4 changes: 4 additions & 0 deletions backend/protocol/api/_api_models.py
@@ -496,6 +496,10 @@ class Completion(BaseModel):
output: Output
cost_usd: float
duration_seconds: float
reasoning_token_count: float | None = Field(
default=None,
description="The number of reasoning tokens used in the inference, if applicable.",
)

completions: list[Completion] | None = Field(default=None, description="The completions of the experiment.")

1 change: 1 addition & 0 deletions backend/protocol/api/_services/conversions.py
@@ -919,6 +919,7 @@ def experiment_completion_from_domain(completion: ExperimentOutput) -> Experimen
output=output_from_domain(completion.output) if completion.output else Output(),
cost_usd=completion.cost_usd or 0.0,
duration_seconds=completion.duration_seconds or 0.0,
reasoning_token_count=completion.reasoning_token_count,
)


54 changes: 54 additions & 0 deletions backend/protocol/api/_services/experiment_service.py
@@ -84,6 +84,25 @@ async def get_experiment(
input_ids=input_ids,
)

# Fetch reasoning tokens for outputs if they exist
if exp.outputs:
completion_ids = [output.completion_id for output in exp.outputs]
# Get completions with traces from ClickHouse (exclude large fields we don't need)
completions_with_traces = await self.completion_storage.completions_by_ids(
completion_ids, exclude={"agent_id", "input_variables", "input_messages", "output_messages", "messages"},
)
# Create a map of completion_id -> reasoning_token_count
reasoning_tokens_map = {}
for comp in completions_with_traces:
reasoning_tokens = self._calculate_reasoning_tokens_from_traces(comp.traces)
if reasoning_tokens is not None:
reasoning_tokens_map[comp.id] = reasoning_tokens

# Attach reasoning token counts to experiment outputs
for output in exp.outputs:
if output.completion_id in reasoning_tokens_map:
output.reasoning_token_count = reasoning_tokens_map[output.completion_id]

annotations: list[Annotation] = []
if include is None or "annotations" in include:
annotations = await self.annotation_storage.list(
@@ -97,6 +116,41 @@ async def get_experiment(
# getting annotations as needed
return experiment_from_domain(exp, annotations)

def _calculate_reasoning_tokens_from_traces(self, traces: list[Any] | None) -> float | None:
"""Calculate total reasoning tokens from traces.

Args:
traces: List of trace objects

Returns:
Total reasoning tokens as float, or None if no reasoning tokens found
"""
if not traces:
return None

total_reasoning_tokens = 0.0
has_reasoning_field = False

for trace in traces:
# Only check LLM traces
if not hasattr(trace, "kind") or trace.kind != "llm":
continue

# Check if trace has usage data
if not hasattr(trace, "usage") or not trace.usage:
continue

# Handle detailed usage structure
if hasattr(trace.usage, "completion") and trace.usage.completion:
completion_usage = trace.usage.completion

# Check if reasoning_token_count field exists
if hasattr(completion_usage, "reasoning_token_count") and completion_usage.reasoning_token_count is not None:
has_reasoning_field = True
total_reasoning_tokens += float(completion_usage.reasoning_token_count or 0)

return total_reasoning_tokens if has_reasoning_field else None

async def list_experiments(self, agent_id: str | None = None, limit: int = 10, offset: int = 0) -> Page[Experiment]:
if agent_id:
agent = await self.agent_storage.get_agent(agent_id)
10 changes: 10 additions & 0 deletions web/eslint.config.mjs
@@ -12,6 +12,16 @@ const compat = new FlatCompat({
const eslintConfig = [
...compat.extends("next/core-web-vitals", "next/typescript"),
{
ignores: [
".next/**/*",
"out/**/*",
"build/**/*",
"dist/**/*",
"node_modules/**/*",
".next/types/**/*",
".next/static/**/*",
".next/server/**/*",
],
rules: {
"no-restricted-imports": [
"error",
22 changes: 20 additions & 2 deletions web/src/app/experiments/[id]/sections/Results/MatrixSection.tsx
@@ -10,6 +10,7 @@ import {
getSharedPartsOfPrompts,
getValidCosts,
getValidDurations,
getValidReasoningTokens,
} from "@/components/utils/utils";
import { useColumnWidths } from "@/hooks/useColumnWidths";
import { useVersionHiding } from "@/hooks/useVersionHiding";
@@ -115,6 +116,9 @@ export function MatrixSection(props: Props) {
const allAvgDurations = priceAndLatencyPerVersion
.map(({ metrics }) => metrics.avgDuration)
.filter((duration): duration is number => duration !== undefined);
const allAvgReasoningTokens = priceAndLatencyPerVersion
.map(({ metrics }) => metrics.avgReasoningTokens)
.filter((tokens): tokens is number => tokens !== undefined);

// Calculate raw metrics lookup for percentile data
const rawMetricsPerVersionPerKey = getRawMetricsPerVersionPerKey(experiment, annotations);
@@ -140,6 +144,10 @@ export function MatrixSection(props: Props) {
if (priceAndLatency.metrics.avgDuration !== undefined) {
allMetrics.unshift({ key: "duration", average: priceAndLatency.metrics.avgDuration });
}
// Only add reasoning tokens metric if it has a valid value
if (priceAndLatency.metrics.avgReasoningTokens !== undefined) {
allMetrics.unshift({ key: "reasoning", average: priceAndLatency.metrics.avgReasoningTokens });
}
}

// Combine allMetricsPerKey with price and latency data
@@ -151,13 +159,19 @@ export function MatrixSection(props: Props) {
if (allAvgDurations.length > 0) {
allMetricsPerKeyForVersion.duration = allAvgDurations;
}
if (allAvgReasoningTokens.length > 0) {
allMetricsPerKeyForVersion.reasoning = allAvgReasoningTokens;
}
}

// Combine rawMetricsPerKey with price and latency data
const versionMetricsPerKeyForVersion = { ...rawMetricsForVersion };
if (priceAndLatency?.metrics) {
versionMetricsPerKeyForVersion.cost = priceAndLatency.metrics.costs;
versionMetricsPerKeyForVersion.duration = priceAndLatency.metrics.durations;
if (priceAndLatency.metrics.reasoningTokens.length > 0) {
versionMetricsPerKeyForVersion.reasoning = priceAndLatency.metrics.reasoningTokens;
}
}

// Find the original index of this version in the sorted versions array
@@ -204,9 +218,10 @@ export function MatrixSection(props: Props) {
.map((version) => findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id))
.filter(Boolean); // Remove undefined completions

// Calculate cost and duration arrays for this row using centralized utility functions
// Calculate cost, duration, and reasoning token arrays for this row using centralized utility functions
const allCostsForRow = getValidCosts(completionsForInput);
const allDurationsForRow = getValidDurations(completionsForInput);
const allReasoningTokensForRow = getValidReasoningTokens(completionsForInput);

// Calculate metrics per key for this row (for row-based comparison coloring)
const allMetricsPerKeyForRowData = getAllMetricsPerKeyForRow(experiment, annotations, input.id);
@@ -216,13 +231,16 @@ export function MatrixSection(props: Props) {
...allMetricsPerKeyForRowData,
};

// Add cost and duration arrays if they have data
// Add cost, duration, and reasoning token arrays if they have data
if (allCostsForRow.length > 0) {
allMetricsPerKeyForRow.cost = allCostsForRow;
}
if (allDurationsForRow.length > 0) {
allMetricsPerKeyForRow.duration = allDurationsForRow;
}
if (allReasoningTokensForRow.length > 0) {
allMetricsPerKeyForRow.reasoning = allReasoningTokensForRow;
}

return orderedVersions.map((version) => {
const completion = findCompletionForInputAndVersion(experiment.completions || [], input.id, version.id);
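Note: getValidReasoningTokens is imported from "@/components/utils/utils", but that file's diff is not included in this excerpt. A minimal sketch of what the helper presumably does, assuming the generated ExperimentCompletion type exposes the new reasoning_token_count field from _api_models.py and mirroring getValidCosts / getValidDurations:

// Hypothetical sketch, not part of this diff; the reasoning_token_count field name is
// assumed to match the API model added in backend/protocol/api/_api_models.py.
import { ExperimentCompletion } from "@/types/models";

export function getValidReasoningTokens(
  completions: (ExperimentCompletion | undefined)[]
): number[] {
  // Keep only completions that actually reported a reasoning token count, so callers
  // can use an empty array to decide whether to show the metric at all.
  return completions
    .map((completion) => completion?.reasoning_token_count)
    .filter((tokens): tokens is number => tokens !== undefined && tokens !== null);
}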
@@ -7,7 +7,12 @@ import { PageError } from "@/components/PageError";
import { useToast } from "@/components/ToastProvider";
import { AnnotationsView } from "@/components/annotations/AnnotationsView";
import { MessagesViewer } from "@/components/messages/MessagesViewer";
import { shouldIncludeCostMetric, shouldIncludeDurationMetric } from "@/components/utils/utils";
import {
getReasoningTokenCount,
shouldIncludeCostMetric,
shouldIncludeDurationMetric,
shouldIncludeReasoningMetric,
} from "@/components/utils/utils";
import { Annotation, ExperimentCompletion } from "@/types/models";
import { getMetricsForCompletion } from "../../../utils";

@@ -48,6 +53,14 @@ function CompletionCell(props: CompletionCellProps) {
metrics.push({ key: "duration", average: completion.duration_seconds });
}

// Add reasoning tokens metric if valid using centralized utility
if (shouldIncludeReasoningMetric(completion)) {
const reasoningTokens = getReasoningTokenCount(completion);
if (reasoningTokens !== undefined) {
metrics.push({ key: "reasoning", average: reasoningTokens });
}
}

// Add custom metrics from annotations
if (completionMetrics.length > 0) {
metrics.push(...completionMetrics);
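Note: getReasoningTokenCount and shouldIncludeReasoningMetric also come from "@/components/utils/utils" and are not shown in this excerpt. Rough sketches under the same assumption about the reasoning_token_count field; the positive-value check mirrors how shouldIncludeCostMetric / shouldIncludeDurationMetric are used above:

// Hypothetical sketches; the actual helpers live in web/src/components/utils/utils.ts.
import { ExperimentCompletion } from "@/types/models";

// Returns the completion's reasoning token count if the backend reported one.
export function getReasoningTokenCount(completion: ExperimentCompletion): number | undefined {
  return completion.reasoning_token_count ?? undefined;
}

// Only surface the reasoning metric when a positive token count is present.
export function shouldIncludeReasoningMetric(completion: ExperimentCompletion): boolean {
  const tokens = getReasoningTokenCount(completion);
  return tokens !== undefined && tokens > 0;
}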
19 changes: 17 additions & 2 deletions web/src/components/MetricItem.tsx
@@ -71,6 +71,9 @@ export function MetricItem({
if (metricKey.includes("duration") || metricKey.includes("latency")) {
return "duration";
}
if (metricKey.includes("reasoning")) {
return "reasoning";
}
return undefined;
}, [metricKey]);

@@ -108,18 +111,30 @@ export function MetricItem({
return (value: number) => (usePer1kMultiplier ? formatCurrency(value, 1000) : `$${formatNumber(value)}`);
} else if (metricType === "duration") {
return formatDuration;
} else if (metricType === "reasoning") {
return (value: number) => `${Math.round(value).toLocaleString()} tokens`;
} else {
return (value: number) => value.toFixed(2);
}
}, [metricType, usePer1kMultiplier]);

const displayLabel = showAvgPrefix
? `Average ${metricKey === "cost" ? (usePer1kMultiplier ? "cost (per 1K)" : "cost") : metricKey.replace(/_/g, " ")}`
? `Average ${
metricKey === "cost"
? usePer1kMultiplier
? "cost (per 1K)"
: "cost"
: metricKey === "reasoning"
? "reasoning"
: metricKey.replace(/_/g, " ")
}`
: metricKey === "cost"
? usePer1kMultiplier
? "cost (per 1K)"
: "cost"
: metricKey.replace(/_/g, " ");
: metricKey === "reasoning"
? "reasoning"
: metricKey.replace(/_/g, " ");

if (percentiles && showAvgPrefix) {
return (