Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions competitors.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,15 @@
size_class: large
knowledge_cutoff: 2025-06

- name: raw-api-loop/gpt-5.5-pro
model: openai/gpt-5.5-pro
runtime: raw-api-loop
tool_profile: read-grep
auth_profile: openrouter-api-key
cost_model: '{"base_url": "https://openrouter.ai/api/v1", "input_usd_per_mtok": 30.0, "output_usd_per_mtok": 180.0}'
size_class: large
knowledge_cutoff: 2025-06

- name: raw-api-loop/glm-5.1
model: z-ai/glm-5.1
runtime: raw-api-loop
Expand Down
22 changes: 20 additions & 2 deletions nelson/html_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,17 +997,27 @@ def generate_leaderboard_report(
'<th class="num">Cases</th>'
"</tr>"
)
# A competitor that completed fewer cases than the most complete run (full_n,
# the largest case count among the entries) has a detection rate computed over
# a partial denominator — flag it so its rate is not read as rank-comparable
# with full-corpus competitors (see the footnote below).
full_n = max((e.cases for e in entries), default=0)
for i, e in enumerate(entries, 1):
star = (
' <span class="frontier-star" title="On a Pareto frontier">★</span>'
if e.competitor_name in cost_front or e.competitor_name in lat_front
else ""
)
partial = (
f'<sup style="color: var(--yellow)" title="Partial coverage: audited '
f'{e.cases} of {full_n} cases — see note below">*</sup>'
if full_n and e.cases < full_n
else ""
)
Comment thread
swelljoe marked this conversation as resolved.
parts.append(
f'<tr><td class="lead-rank">{i}</td>'
f"<td>{escape(e.competitor_name)}{star}</td>"
f"<td>{escape(e.competitor_name)}{partial}{star}</td>"
f"<td>{escape(e.size_class or '—')}</td>"
f'<td class="num">{_pct(e.detection_rate) if e.eligible else "—"}</td>'
f'<td class="num">{_pct(e.detection_rate) if e.eligible else "—"}{partial}</td>'
f'<td class="num">{e.hits}/{e.eligible}</td>'
f'<td class="num">{_pct(e.precision)}</td>'
f'<td class="num">{f"{e.fp_per_case:.2f}" if e.fp_per_case is not None else "—"}</td>'
Expand All @@ -1028,6 +1038,14 @@ def generate_leaderboard_report(
"the competitor's own spend per audited case. ★ = on a Pareto "
"frontier below.</p>"
)
if any(full_n and e.cases < full_n for e in entries):
parts.append(
'<p class="muted">* Partial coverage: this competitor completed fewer '
f"than the full {full_n} cases (see the Cases column). Its detection "
"rate is therefore based on fewer audited cases and is not directly "
"rank-comparable with full-corpus competitors — read it alongside the "
"Cases count, not the rank.</p>"
Comment thread
swelljoe marked this conversation as resolved.
)

# Pareto scatter plots.
parts.append("<h2>Pareto frontier</h2>")
Expand Down
Loading