diff --git a/competitors.example.yaml b/competitors.example.yaml index ab624ed..ffb26f0 100644 --- a/competitors.example.yaml +++ b/competitors.example.yaml @@ -305,6 +305,15 @@ size_class: large knowledge_cutoff: 2025-06 +- name: raw-api-loop/gpt-5.5-pro + model: openai/gpt-5.5-pro + runtime: raw-api-loop + tool_profile: read-grep + auth_profile: openrouter-api-key + cost_model: '{"base_url": "https://openrouter.ai/api/v1", "input_usd_per_mtok": 30.0, "output_usd_per_mtok": 180.0}' + size_class: large + knowledge_cutoff: 2025-06 + - name: raw-api-loop/glm-5.1 model: z-ai/glm-5.1 runtime: raw-api-loop diff --git a/nelson/html_report.py b/nelson/html_report.py index d6ae1aa..c44d71a 100644 --- a/nelson/html_report.py +++ b/nelson/html_report.py @@ -997,17 +997,27 @@ def generate_leaderboard_report( 'Cases' "" ) + # A competitor that completed fewer than the fullest run's case count has a + # detection rate over a partial denominator — flag it so its rate is not read + # as rank-comparable with full-corpus competitors (see the footnote below). + full_n = max((e.cases for e in entries), default=0) for i, e in enumerate(entries, 1): star = ( ' ' if e.competitor_name in cost_front or e.competitor_name in lat_front else "" ) + partial = ( + f'*' + if full_n and e.cases < full_n + else "" + ) parts.append( f'{i}' - f"{escape(e.competitor_name)}{star}" + f"{escape(e.competitor_name)}{partial}{star}" f"{escape(e.size_class or '—')}" - f'{_pct(e.detection_rate) if e.eligible else "—"}' + f'{_pct(e.detection_rate) if e.eligible else "—"}{partial}' f'{e.hits}/{e.eligible}' f'{_pct(e.precision)}' f'{f"{e.fp_per_case:.2f}" if e.fp_per_case is not None else "—"}' @@ -1028,6 +1038,14 @@ def generate_leaderboard_report( "the competitor's own spend per audited case. ★ = on a Pareto " "frontier below.

" ) + if any(full_n and e.cases < full_n for e in entries): + parts.append( + '

* Partial coverage: this competitor completed fewer ' + f"than the full {full_n} cases (see the Cases column). Its detection " + "rate is therefore based on fewer audited cases and is not directly " + "rank-comparable with full-corpus competitors — read it alongside the " + "Cases count, not the rank.

" + ) # Pareto scatter plots. parts.append("

Pareto frontier

")