From 16dfbe5eed5b3d064656a1544fc6b7d7648796d6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 05:48:06 -0700 Subject: [PATCH 1/9] feat: support multiple comma-separated run IDs for unofficial runs Accept `?unofficialrun=123,456,789` on the dashboard URL to merge benchmark and evaluation data from multiple GitHub Actions runs into a single view. Each run's benchmarks are tagged with their originating run_url for per-point traceability, and eval config ids are offset per-run to avoid collisions in the merged set. A NON-OFFICIAL banner is rendered per run. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/cypress/support/mock-data.ts | 1 + .../src/app/api/unofficial-run/route.test.ts | 182 ++++++++++++++- .../app/src/app/api/unofficial-run/route.ts | 208 ++++++++++++------ .../components/unofficial-run-provider.tsx | 33 +-- 4 files changed, 338 insertions(+), 86 deletions(-) diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index e6720c0b..15a3b741 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -436,6 +436,7 @@ export function createMockUnofficialRunContext( return { isUnofficialRun: false, unofficialRunInfo: null, + unofficialRunInfos: [], unofficialChartData: null, unofficialEvalRows: null, loading: false, diff --git a/packages/app/src/app/api/unofficial-run/route.test.ts b/packages/app/src/app/api/unofficial-run/route.test.ts index 9077edbf..be87e016 100644 --- a/packages/app/src/app/api/unofficial-run/route.test.ts +++ b/packages/app/src/app/api/unofficial-run/route.test.ts @@ -206,7 +206,7 @@ describe('normalizeArtifactRows', () => { describe('normalizeEvalArtifactRows', () => { it('converts aggregate eval rows to EvalRow shape with synthetic config ids', () => { - const rows = normalizeEvalArtifactRows( + const { rows, maxConfigId } = normalizeEvalArtifactRows( [rawEvalRow({ task: 'gsm8k', conc: 16 }), rawEvalRow({ task: 
'mmlu', conc: 32 })], '2026-03-01', '2026-03-01T12:34:56Z', @@ -229,10 +229,26 @@ describe('normalizeEvalArtifactRows', () => { }); expect(rows[1].config_id).toBe(1); expect(rows[1].metrics.em_strict).toBe(0.91); + expect(maxConfigId).toBe(1); + }); + + it('offsets config ids when configIdOffset is provided', () => { + const { rows, maxConfigId } = normalizeEvalArtifactRows( + [rawEvalRow({ task: 'gsm8k', conc: 16 }), rawEvalRow({ task: 'mmlu', hw: 'h200-nv' })], + '2026-03-01', + '2026-03-01T12:34:56Z', + 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/123', + 10, + ); + + expect(rows).toHaveLength(2); + // Two distinct configs (different hw) → local ids 1 and 2, plus offset = 11 and 12 + expect(rows.map((r) => r.config_id).toSorted()).toEqual([11, 12]); + expect(maxConfigId).toBe(12); }); it('skips eval rows with unmapped hardware/model/task', () => { - const rows = normalizeEvalArtifactRows( + const { rows } = normalizeEvalArtifactRows( [ rawEvalRow({ hw: 'unknown-gpu' }), rawEvalRow({ model_prefix: 'unknown', model: 'unknown/model' }), @@ -278,6 +294,13 @@ describe('GET /api/unofficial-run', () => { expect(res.status).toBe(400); }); + it('returns 400 when comma-separated list contains a non-numeric id', async () => { + const res = await GET(makeRequest('runId=123,abc,456')); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toContain('comma-separated'); + }); + it('returns 500 when GITHUB_TOKEN is not set', async () => { delete process.env.GITHUB_TOKEN; const mod = await import('./route'); @@ -375,10 +398,12 @@ describe('GET /api/unofficial-run', () => { const res = await GET(makeRequest('runId=123')); expect(res.status).toBe(200); const body = await res.json(); - expect(body.runInfo.id).toBe(123); - expect(body.runInfo.isNonMainBranch).toBe(false); + expect(body.runInfos).toHaveLength(1); + expect(body.runInfos[0].id).toBe(123); + expect(body.runInfos[0].isNonMainBranch).toBe(false); 
expect(body.benchmarks).toHaveLength(1); expect(body.benchmarks[0].hardware).toBe('h200'); + expect(body.benchmarks[0].run_url).toBe('http://github.com/run/123'); expect(body.evaluations).toEqual([]); }); @@ -466,7 +491,154 @@ describe('GET /api/unofficial-run', () => { const res = await GET(makeRequest('runId=456')); expect(res.status).toBe(200); const body = await res.json(); - expect(body.runInfo.isNonMainBranch).toBe(true); + expect(body.runInfos).toHaveLength(1); + expect(body.runInfos[0].isNonMainBranch).toBe(true); expect(body.benchmarks).toHaveLength(0); }); + + it('merges data from multiple comma-separated runIds', async () => { + // Run 1 metadata + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 111, + name: 'run-1', + head_branch: 'feature/a', + head_sha: 'aaa', + created_at: '2026-01-01T00:00:00Z', + html_url: 'http://github.com/run/111', + conclusion: 'success', + status: 'completed', + }), + }); + // Run 1 artifacts + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 10, archive_download_url: 'http://dl-1' }], + }), + }); + // Run 1 download + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([ + { entryName: 'r1.json', getData: () => Buffer.from(JSON.stringify([rawRow()])) }, + ]); + + // Run 2 metadata + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 222, + name: 'run-2', + head_branch: 'feature/b', + head_sha: 'bbb', + created_at: '2026-01-02T00:00:00Z', + html_url: 'http://github.com/run/222', + conclusion: 'success', + status: 'completed', + }), + }); + // Run 2 artifacts + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 20, archive_download_url: 'http://dl-2' }], + }), + }); + // Run 2 download + mockFetch.mockResolvedValueOnce({ + 
ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([ + { + entryName: 'r2.json', + getData: () => Buffer.from(JSON.stringify([rawRow({ hw: 'mi355x-amds' })])), + }, + ]); + + const res = await GET(makeRequest('runId=111,222')); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body.runInfos).toHaveLength(2); + expect(body.runInfos.map((r: { id: number }) => r.id)).toEqual([111, 222]); + expect(body.benchmarks).toHaveLength(2); + // Each benchmark row is tagged with its originating run_url + expect(body.benchmarks[0].run_url).toBe('http://github.com/run/111'); + expect(body.benchmarks[1].run_url).toBe('http://github.com/run/222'); + }); + + it('dedupes repeated runIds in the comma-separated list', async () => { + // Only one set of fetches expected since 123 is deduped + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 123, + head_branch: 'main', + html_url: 'http://github.com/run/123', + created_at: '2026-01-01T00:00:00Z', + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 10, archive_download_url: 'http://dl' }], + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([]); + + const res = await GET(makeRequest('runId=123,123')); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body.runInfos).toHaveLength(1); + // Only three fetches were made (run, artifacts, download) — not six + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it('fails with the upstream status when any runId in the list errors', async () => { + // First run succeeds + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 111, + head_branch: 'main', + html_url: 'http://github.com/run/111', + created_at: '2026-01-01T00:00:00Z', + 
}), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 10, archive_download_url: 'http://dl' }], + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([]); + + // Second run 404s on metadata fetch + mockFetch.mockResolvedValueOnce({ ok: false, status: 404, statusText: 'Not Found' }); + + const res = await GET(makeRequest('runId=111,999')); + expect(res.status).toBe(404); + const body = await res.json(); + expect(body.error).toContain('999'); + }); }); diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 79ac0665..4e5b5265 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -24,6 +24,7 @@ import { export function normalizeArtifactRows( rawRows: Record[], date: string, + runUrl: string | null = null, ): BenchmarkRow[] { const tracker = createSkipTracker(); const results: BenchmarkRow[] = []; @@ -55,7 +56,7 @@ export function normalizeArtifactRows( image: params.image, metrics: params.metrics, date, - run_url: null, + run_url: runUrl, }); } return results; @@ -82,32 +83,40 @@ function evalConfigKey(config: EvalParams['config']): string { ].join('|'); } -/** Normalize aggregate eval rows into the EvalRow shape the frontend expects. */ +/** + * Normalize aggregate eval rows into the EvalRow shape the frontend expects. + * + * When merging rows from multiple runs, pass `configIdOffset` so synthetic config + * ids from this batch don't collide with ids already emitted by earlier batches. + * Returns the rows and the maximum config id assigned, so the caller can advance + * the offset for the next batch. 
+ */ export function normalizeEvalArtifactRows( rawRows: Record[], date: string, timestamp: string, runUrl: string, -): EvalRow[] { + configIdOffset = 0, +): { rows: EvalRow[]; maxConfigId: number } { const tracker = createSkipTracker(); const configIds = new Map(); - let nextConfigId = 1; - const results: EvalRow[] = []; + let nextLocalId = 1; + const rows: EvalRow[] = []; for (const raw of rawRows) { const params = mapAggEvalRow(raw as Record, tracker); if (!params) continue; const key = evalConfigKey(params.config); - let configId = configIds.get(key); - if (!configId) { - configId = nextConfigId; - configIds.set(key, configId); - nextConfigId += 1; + let localId = configIds.get(key); + if (!localId) { + localId = nextLocalId; + configIds.set(key, localId); + nextLocalId += 1; } - results.push({ - config_id: configId, + rows.push({ + config_id: configIdOffset + localId, hardware: params.config.hardware, framework: params.config.framework, model: params.config.model, @@ -134,7 +143,7 @@ export function normalizeEvalArtifactRows( }); } - return results; + return { rows, maxConfigId: configIdOffset + (nextLocalId - 1) }; } /** Extract all valid JSON files from a ZIP buffer; malformed JSON entries are skipped. */ @@ -161,10 +170,111 @@ async function downloadArtifactRows(archiveUrl: string, githubToken: string) { return { rows, errorResponse: null }; } +/** Parse the runId query param into a list of unique numeric ids. */ +function parseRunIds(raw: string | null): { ids: string[]; error: string | null } { + if (!raw) return { ids: [], error: 'runId must be provided' }; + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => s.trim()) + .filter(Boolean), + ), + ]; + if (ids.length === 0 || !ids.every((id) => /^\d+$/.test(id))) { + return { ids: [], error: 'runId must be a comma-separated list of numeric values' }; + } + return { ids, error: null }; +} + +/** Fetch, download, and normalize data for a single run. Errors bubble as NextResponse. 
*/ +async function processSingleRun( + runId: string, + githubToken: string, + evalConfigIdOffset: number, +): Promise< + | { errorResponse: NextResponse } + | { + errorResponse: null; + runInfo: ReturnType & { isNonMainBranch: boolean }; + benchmarks: BenchmarkRow[]; + evaluations: EvalRow[]; + nextEvalConfigIdOffset: number; + } +> { + const runResp = await fetchGithubWorkflowRun(runId, githubToken); + if (!runResp.ok) { + return { + errorResponse: NextResponse.json( + { error: `GitHub API error for runId ${runId}: ${runResp.statusText}` }, + { status: runResp.status }, + ), + }; + } + const run = (await runResp.json()) as GithubWorkflowRun; + + const artifacts = await fetchGithubRunArtifacts(runId, githubToken); + const bmkArtifact = artifacts + .filter((a) => a.name === 'results_bmk') + .toSorted((a, b) => b.id - a.id)[0]; + const evalArtifact = artifacts + .filter((a) => a.name === 'eval_results_all') + .toSorted((a, b) => b.id - a.id)[0]; + + if (!bmkArtifact && !evalArtifact) { + return { + errorResponse: NextResponse.json( + { + error: `No results_bmk or eval_results_all artifact found for runId ${runId}`, + }, + { status: 404 }, + ), + }; + } + + const date = getRunDate(run); + const runUrl = run.html_url ?? ''; + const timestamp = run.created_at ?? 
`${date}T00:00:00Z`; + let benchmarks: BenchmarkRow[] = []; + let evaluations: EvalRow[] = []; + let nextEvalConfigIdOffset = evalConfigIdOffset; + + if (bmkArtifact) { + const { rows, errorResponse } = await downloadArtifactRows( + bmkArtifact.archive_download_url, + githubToken, + ); + if (errorResponse) return { errorResponse }; + benchmarks = normalizeArtifactRows(rows, date, runUrl || null); + } + + if (evalArtifact) { + const { rows, errorResponse } = await downloadArtifactRows( + evalArtifact.archive_download_url, + githubToken, + ); + if (errorResponse) return { errorResponse }; + const normalized = normalizeEvalArtifactRows(rows, date, timestamp, runUrl, evalConfigIdOffset); + evaluations = normalized.rows; + nextEvalConfigIdOffset = normalized.maxConfigId; + } + + return { + errorResponse: null, + runInfo: { + ...normalizeGithubRunInfo(run), + isNonMainBranch: run.head_branch !== 'main', + }, + benchmarks, + evaluations, + nextEvalConfigIdOffset, + }; +} + export async function GET(request: NextRequest) { - const runId = request.nextUrl.searchParams.get('runId'); - if (!runId || !/^\d+$/.test(runId)) { - return NextResponse.json({ error: 'runId must be a numeric value' }, { status: 400 }); + const { ids: runIds, error: runIdError } = parseRunIds(request.nextUrl.searchParams.get('runId')); + if (runIdError) { + return NextResponse.json({ error: runIdError }, { status: 400 }); } const githubToken = getGithubToken(); @@ -173,63 +283,25 @@ export async function GET(request: NextRequest) { } try { - // Fetch workflow run metadata - const runResp = await fetchGithubWorkflowRun(runId, githubToken); - if (!runResp.ok) { - return NextResponse.json( - { error: `GitHub API: ${runResp.statusText}` }, - { status: runResp.status }, - ); - } - const run = (await runResp.json()) as GithubWorkflowRun; - - // Fetch artifacts, find latest benchmark/eval aggregates - const artifacts = await fetchGithubRunArtifacts(runId, githubToken); - - const bmkArtifact = artifacts - 
.filter((a) => a.name === 'results_bmk') - .toSorted((a, b) => b.id - a.id)[0]; + const runInfos: (ReturnType & { + isNonMainBranch: boolean; + })[] = []; + const benchmarks: BenchmarkRow[] = []; + const evaluations: EvalRow[] = []; + let evalConfigIdOffset = 0; - const evalArtifact = artifacts - .filter((a) => a.name === 'eval_results_all') - .toSorted((a, b) => b.id - a.id)[0]; - - if (!bmkArtifact && !evalArtifact) { - return NextResponse.json( - { error: 'No results_bmk or eval_results_all artifact found' }, - { status: 404 }, - ); - } - - const date = getRunDate(run); - const runUrl = run.html_url ?? ''; - const timestamp = run.created_at ?? `${date}T00:00:00Z`; - let benchmarks: BenchmarkRow[] = []; - let evaluations: EvalRow[] = []; - - if (bmkArtifact) { - const { rows, errorResponse } = await downloadArtifactRows( - bmkArtifact.archive_download_url, - githubToken, - ); - if (errorResponse) return errorResponse; - benchmarks = normalizeArtifactRows(rows, date); - } + for (const runId of runIds) { + const result = await processSingleRun(runId, githubToken, evalConfigIdOffset); + if (result.errorResponse) return result.errorResponse; - if (evalArtifact) { - const { rows, errorResponse } = await downloadArtifactRows( - evalArtifact.archive_download_url, - githubToken, - ); - if (errorResponse) return errorResponse; - evaluations = normalizeEvalArtifactRows(rows, date, timestamp, runUrl); + runInfos.push(result.runInfo); + benchmarks.push(...result.benchmarks); + evaluations.push(...result.evaluations); + evalConfigIdOffset = result.nextEvalConfigIdOffset; } return NextResponse.json({ - runInfo: { - ...normalizeGithubRunInfo(run), - isNonMainBranch: run.head_branch !== 'main', - }, + runInfos, benchmarks, evaluations, }); diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index a44d5c1c..262373b5 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ 
b/packages/app/src/components/unofficial-run-provider.tsx @@ -51,7 +51,10 @@ interface AvailableModelSequence { export interface UnofficialRunContextType { isUnofficialRun: boolean; + /** First run in the loaded set — kept as a convenience alias for overlay labels. */ unofficialRunInfo: UnofficialRunInfo | null; + /** All runs loaded from the `unofficialrun(s)` URL param (comma-separated). */ + unofficialRunInfos: UnofficialRunInfo[]; unofficialChartData: UnofficialChartData | null; unofficialEvalRows: EvalRow[] | null; loading: boolean; @@ -150,7 +153,8 @@ export function parseAvailableModelsAndSequences( } export function UnofficialRunProvider({ children }: { children: ReactNode }) { - const [unofficialRunInfo, setUnofficialRunInfo] = useState(null); + const [unofficialRunInfos, setUnofficialRunInfos] = useState([]); + const unofficialRunInfo = unofficialRunInfos[0] ?? null; const [unofficialChartData, setUnofficialChartData] = useState(null); const [unofficialEvalRows, setUnofficialEvalRows] = useState(null); const [loading, setLoading] = useState(false); @@ -212,7 +216,7 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { ); const clearUnofficialRun = useCallback(() => { - setUnofficialRunInfo(null); + setUnofficialRunInfos([]); setUnofficialChartData(null); setUnofficialEvalRows(null); setError(null); @@ -239,15 +243,15 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { useEffect(() => { const load = () => { const params = new URLSearchParams(window.location.search); - let unofficialRunId: string | undefined; + let unofficialRunIdParam: string | undefined; for (const [key, value] of params) { if (UNOFFICIAL_RUN_PARAM_RE.test(key) && value) { - unofficialRunId = value; + unofficialRunIdParam = value; break; } } - if (!unofficialRunId) { - setUnofficialRunInfo(null); + if (!unofficialRunIdParam) { + setUnofficialRunInfos([]); setUnofficialChartData(null); setUnofficialEvalRows(null); setError(null); @@ 
-258,12 +262,14 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { setLoading(true); setError(null); - fetch(`/api/unofficial-run?runId=${unofficialRunId}`) + // Pass the raw param value through — it may be a single id or a comma-separated list. + // encodeURIComponent percent-encodes commas (as %2C) and any stray whitespace/symbols; the API route decodes them via searchParams.get. + fetch(`/api/unofficial-run?runId=${encodeURIComponent(unofficialRunIdParam)}`) .then(async (response) => { const data = await response.json(); if (!response.ok) throw new Error(data.error || 'Failed to fetch unofficial run'); - setUnofficialRunInfo(data.runInfo); + setUnofficialRunInfos(Array.isArray(data.runInfos) ? data.runInfos : []); const chartData = buildChartData(data.benchmarks ?? []); setUnofficialChartData(chartData); setUnofficialEvalRows(data.evaluations ?? []); @@ -271,7 +277,7 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { }) .catch((caughtError) => { setError(caughtError instanceof Error ? 
caughtError.message : 'Unknown error'); - setUnofficialRunInfo(null); + setUnofficialRunInfos([]); setUnofficialChartData(null); setUnofficialEvalRows(null); setAvailableModelsAndSequences([]); @@ -287,8 +293,9 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { return ( 0, unofficialRunInfo, + unofficialRunInfos, unofficialChartData, unofficialEvalRows, loading, @@ -305,9 +312,9 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { setLocalOfficialOverride, }} > - {unofficialRunInfo && ( - - )} + {unofficialRunInfos.map((info) => ( + + ))} {children} ); From 1692e4c29d1484aa419c0af3dd8d56e7d70f0c25 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 06:08:44 -0700 Subject: [PATCH 2/9] feat(unofficial-run): hue-shift overlay strokes per run index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When multiple unofficial runs are loaded, overlay points/rooflines for the same GPU were rendered in identical colors, making it impossible to tell runs apart. Derive a per-run hue rotation from the run's position in the loaded set and apply it via CSS filter — run 0 unchanged, each subsequent run shifted by 55°. Roofline grouping now includes runIndex so each run gets its own Pareto front. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/cypress/support/mock-data.ts | 1 + .../components/inference/ui/ScatterGraph.tsx | 86 ++++++++++++++----- .../components/unofficial-run-provider.tsx | 19 ++++ 3 files changed, 83 insertions(+), 23 deletions(-) diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index 15a3b741..6589df66 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -437,6 +437,7 @@ export function createMockUnofficialRunContext( isUnofficialRun: false, unofficialRunInfo: null, unofficialRunInfos: [], + runIndexByUrl: {}, unofficialChartData: null, unofficialEvalRows: null, loading: false, diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 2e078f89..939da519 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -63,6 +63,29 @@ const getXPath = (size: number) => { return `M ${-s} ${-s} L ${s} ${s} M ${s} ${-s} L ${-s} ${s}`; }; +/** + * CSS filter to visually distinguish overlay points from multiple unofficial + * runs when more than one is loaded. The first run (index 0) is untouched so + * the common single-run case is unaffected. Applied via `style('filter', ...)` + * on SVG groups — works regardless of whether the underlying stroke color is + * a CSS variable, oklch, or hex. 
+ */ +const OVERLAY_HUE_STEP_DEG = 55; +function overlayFilterForRunIndex(idx: number): string | null { + if (idx <= 0) return null; + const hue = (idx * OVERLAY_HUE_STEP_DEG) % 360; + return `hue-rotate(${hue}deg) saturate(1.2)`; +} +function overlayRunIndex(runUrl: string | null | undefined, map: Record): number { + if (!runUrl) return 0; + if (runUrl in map) return map[runUrl]; + // Fall back to the numeric run id parsed from the URL — handles cases where + // `updateRepoUrl` rewrote the host/org and the full-URL key no longer matches. + const idMatch = runUrl.match(/\/runs\/(\d+)/); + if (idMatch && idMatch[1] in map) return map[idMatch[1]]; + return 0; +} + const formatChangelogDescription = (desc: string | string[]): React.JSX.Element => { if (typeof desc === 'string') { return ( @@ -147,6 +170,7 @@ const ScatterGraph = React.memo( resetOverlayHwTypes, localOfficialOverride, setLocalOfficialOverride, + runIndexByUrl, } = useUnofficialRun(); const chartRef = useRef(null); @@ -323,17 +347,24 @@ const ScatterGraph = React.memo( }, [filteredData, processedOverlayData]); const overlayRooflines = useMemo(() => { - if (processedOverlayData.length === 0) return {}; + interface Entry { + hwKey: string; + runIndex: number; + points: InferenceData[]; + } + if (processedOverlayData.length === 0) return {} as Record; + // Group by hwKey + precision + runIndex so overlay rooflines from different + // unofficial runs stay separate and can be styled with per-run hue shifts. const grouped = processedOverlayData.reduce( (acc, p) => { - const key = `${p.hwKey}_${p.precision}`; - if (!acc[key]) acc[key] = []; - acc[key].push(p); + const runIndex = overlayRunIndex(p.run_url ?? 
null, runIndexByUrl); + const key = `${p.hwKey}_${p.precision}_run${runIndex}`; + if (!acc[key]) acc[key] = { hwKey: String(p.hwKey), runIndex, points: [] }; + acc[key].points.push(p); return acc; }, - {} as Record, + {} as Record, ); - const result: Record = {}; const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition; const dir = chartDefinition[rooflineKey] as | 'upper_right' @@ -341,20 +372,21 @@ const ScatterGraph = React.memo( | 'lower_left' | 'lower_right' | undefined; - for (const hw of Object.keys(grouped)) { + const result: Record = {}; + for (const [key, group] of Object.entries(grouped)) { const front = dir === 'upper_right' - ? paretoFrontUpperRight(grouped[hw]) + ? paretoFrontUpperRight(group.points) : dir === 'upper_left' - ? paretoFrontUpperLeft(grouped[hw]) + ? paretoFrontUpperLeft(group.points) : dir === 'lower_left' - ? paretoFrontLowerLeft(grouped[hw]) - : paretoFrontLowerRight(grouped[hw]); + ? paretoFrontLowerLeft(group.points) + : paretoFrontLowerRight(group.points); front.sort((a, b) => a.x - b.x); - result[hw] = front; + result[key] = { hwKey: group.hwKey, runIndex: group.runIndex, points: front }; } return result; - }, [processedOverlayData, selectedYAxisMetric, chartDefinition]); + }, [processedOverlayData, selectedYAxisMetric, chartDefinition, runIndexByUrl]); // All official points for rendering (unfiltered — visibility via opacity) const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]); @@ -1277,16 +1309,17 @@ const ScatterGraph = React.memo( key: string; points: InferenceData[]; stroke: string; + runIndex: number; } const ovEntries: OvEntry[] = []; - Object.entries(overlayRooflines).forEach(([key, pts]) => { - const hw = key.split('_').slice(0, -1).join('_'); - const hwCfg = overlayData.hardwareConfig[hw]; - if (hwCfg && pts.length > 1) { + Object.entries(overlayRooflines).forEach(([key, group]) => { + const hwCfg = overlayData.hardwareConfig[group.hwKey]; + if (hwCfg && 
group.points.length > 1) { ovEntries.push({ key, - points: pts, - stroke: getCssColor(resolveColor(hw)), + points: group.points, + stroke: getCssColor(resolveColor(group.hwKey)), + runIndex: group.runIndex, }); } }); @@ -1304,7 +1337,8 @@ const ScatterGraph = React.memo( .attr('stroke', (d) => d.stroke) .attr('stroke-width', 2) .attr('stroke-dasharray', '6 3') - .attr('d', (d) => lineGen(d.points)); + .attr('d', (d) => lineGen(d.points)) + .style('filter', (d) => overlayFilterForRunIndex(d.runIndex)); // Overlay X-shape points — index-keyed so every point renders const overlayPoints = zoomGroup @@ -1333,6 +1367,11 @@ const ScatterGraph = React.memo( }); overlayPoints.attr('transform', (d) => `translate(${xScale(d.x)},${yScale(d.y)})`); + // Apply per-run hue shift at the group level so the shape and its + // label inherit the same tone and stay visually grouped. + overlayPoints.style('filter', (d) => + overlayFilterForRunIndex(overlayRunIndex(d.run_url ?? null, runIndexByUrl)), + ); overlayPoints .select('.overlay-x') .attr('stroke', (d) => getCssColor(resolveColor(d.hwKey as string))); @@ -1445,10 +1484,10 @@ const ScatterGraph = React.memo( .y((d) => newYScale(d.y)) .curve(d3.curveMonotoneX); - Object.entries(overlayRooflines).forEach(([key, pts]) => { - if (pts.length < 2) return; + Object.entries(overlayRooflines).forEach(([key, group]) => { + if (group.points.length < 2) return; const sel = zoomGroup.select(`.overlay-roofline-${key}`); - if (!sel.empty()) sel.attr('d', lineGen(pts) as string); + if (!sel.empty()) sel.attr('d', lineGen(group.points) as string); }); // Update overlay points @@ -1481,6 +1520,7 @@ const ScatterGraph = React.memo( overlayData, processedOverlayData, overlayRooflines, + runIndexByUrl, hardwareConfig, xLabel, yLabel, diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index 262373b5..87145f1b 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx 
+++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -55,6 +55,12 @@ export interface UnofficialRunContextType { unofficialRunInfo: UnofficialRunInfo | null; /** All runs loaded from the `unofficialrun(s)` URL param (comma-separated). */ unofficialRunInfos: UnofficialRunInfo[]; + /** + * Position of each run in the loaded set, keyed by both `run.url` and the + * numeric id as a string. Used to derive a distinct hue shift per run for + * overlay points so multiple runs are visually separable. + */ + runIndexByUrl: Record; unofficialChartData: UnofficialChartData | null; unofficialEvalRows: EvalRow[] | null; loading: boolean; @@ -228,6 +234,18 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { window.history.pushState({}, '', url); }, []); + // Build a url → index lookup. Keyed by the full run.url AND by the numeric id + // as a string, since `updateRepoUrl` can rewrite hosts/orgs between the + // overlay rendering path and the run metadata. + const runIndexByUrl = useMemo(() => { + const map: Record = {}; + unofficialRunInfos.forEach((info, idx) => { + if (info.url) map[info.url] = idx; + if (info.id !== undefined && info.id !== null) map[String(info.id)] = idx; + }); + return map; + }, [unofficialRunInfos]); + const getOverlayData = useCallback( (model: Model, sequence: Sequence, chartType: 'e2e' | 'interactivity') => { if (!unofficialChartData) return null; @@ -296,6 +314,7 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { isUnofficialRun: unofficialRunInfos.length > 0, unofficialRunInfo, unofficialRunInfos, + runIndexByUrl, unofficialChartData, unofficialEvalRows, loading, From ece3b407ba95f0735b46a3843857a5e4f3bfd50b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 06:16:04 -0700 Subject: [PATCH 3/9] feat(unofficial-run): extend per-run hue shift to evaluation overlays BarChartD3's X-mark overlay points and their error-bar groups now use the same per-run hue rotation as 
the inference scatter overlay, so runs loaded via a comma-separated unofficialrun= list are visually separable on the evaluation tab too. Extracts the shared filter and runIndex helpers into lib/overlay-run-style.ts to avoid duplication. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/evaluation/ui/BarChartD3.tsx | 11 ++++++ .../components/inference/ui/ScatterGraph.tsx | 24 +----------- packages/app/src/lib/overlay-run-style.ts | 37 +++++++++++++++++++ 3 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 packages/app/src/lib/overlay-run-style.ts diff --git a/packages/app/src/components/evaluation/ui/BarChartD3.tsx b/packages/app/src/components/evaluation/ui/BarChartD3.tsx index be5f0211..e4d961b3 100644 --- a/packages/app/src/components/evaluation/ui/BarChartD3.tsx +++ b/packages/app/src/components/evaluation/ui/BarChartD3.tsx @@ -24,6 +24,7 @@ import { Skeleton } from '@/components/ui/skeleton'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { useThemeColors } from '@/hooks/useThemeColors'; import { computeToggle } from '@/hooks/useTogglableSet'; +import { overlayFilterForRunIndex, overlayRunIndex } from '@/lib/overlay-run-style'; const BASE_MARGIN = { top: 24, right: 24, bottom: 52 }; const OVERLAY_X_SIZE = 6; @@ -164,6 +165,7 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { resetOverlayHwTypes, localOfficialOverride, setLocalOfficialOverride, + runIndexByUrl, } = useUnofficialRun(); const chartRef = useRef(null); @@ -535,6 +537,9 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { return bar; }); + bars.style('filter', (d) => + overlayFilterForRunIndex(overlayRunIndex(d.runUrl ?? 
null, runIndexByUrl)), + ); bars .selectAll( '.unofficial-eb-stem, .unofficial-eb-cap-top, .unofficial-eb-cap-bot', @@ -684,6 +689,11 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { (d) => `translate(${xScale(d.score)},${(yScale(d.configLabel) || 0) + yScale.bandwidth() / 2})`, ); + // Per-run hue shift at the group level so the X-mark + score label + // inherit the same tone and stay visually grouped. + overlayPoints.style('filter', (d) => + overlayFilterForRunIndex(overlayRunIndex(d.runUrl ?? null, runIndexByUrl)), + ); overlayPoints .select('.unofficial-eval-x') @@ -775,6 +785,7 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { unofficialChartData, unofficialErrorData, unofficialRunInfo, + runIndexByUrl, ], ); diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 939da519..2d881fcb 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -21,6 +21,7 @@ import type { } from '@/lib/d3-chart/D3Chart/types'; import type { ContinuousScale } from '@/lib/d3-chart/types'; import { computeTooltipPosition } from '@/lib/d3-chart/layers/scatter-points'; +import { overlayFilterForRunIndex, overlayRunIndex } from '@/lib/overlay-run-style'; import { POINT_SIZE, HIT_AREA_RADIUS, @@ -63,29 +64,6 @@ const getXPath = (size: number) => { return `M ${-s} ${-s} L ${s} ${s} M ${s} ${-s} L ${-s} ${s}`; }; -/** - * CSS filter to visually distinguish overlay points from multiple unofficial - * runs when more than one is loaded. The first run (index 0) is untouched so - * the common single-run case is unaffected. Applied via `style('filter', ...)` - * on SVG groups — works regardless of whether the underlying stroke color is - * a CSS variable, oklch, or hex. 
- */ -const OVERLAY_HUE_STEP_DEG = 55; -function overlayFilterForRunIndex(idx: number): string | null { - if (idx <= 0) return null; - const hue = (idx * OVERLAY_HUE_STEP_DEG) % 360; - return `hue-rotate(${hue}deg) saturate(1.2)`; -} -function overlayRunIndex(runUrl: string | null | undefined, map: Record): number { - if (!runUrl) return 0; - if (runUrl in map) return map[runUrl]; - // Fall back to the numeric run id parsed from the URL — handles cases where - // `updateRepoUrl` rewrote the host/org and the full-URL key no longer matches. - const idMatch = runUrl.match(/\/runs\/(\d+)/); - if (idMatch && idMatch[1] in map) return map[idMatch[1]]; - return 0; -} - const formatChangelogDescription = (desc: string | string[]): React.JSX.Element => { if (typeof desc === 'string') { return ( diff --git a/packages/app/src/lib/overlay-run-style.ts b/packages/app/src/lib/overlay-run-style.ts new file mode 100644 index 00000000..5cb5de5f --- /dev/null +++ b/packages/app/src/lib/overlay-run-style.ts @@ -0,0 +1,37 @@ +/** + * Shared helpers for visually differentiating unofficial-run overlay points + * when more than one run is loaded. Consumed by the inference scatter plot + * and the evaluation bar chart so both charts apply the same per-run hue. + */ + +/** Degrees of hue rotation per run index. Tuned so that up to ~6 runs stay distinguishable. */ +const OVERLAY_HUE_STEP_DEG = 55; + +/** + * CSS `filter` value to apply to an overlay group for a given run index. + * Returns `null` for index 0 so single-run behavior is unchanged and no + * filter is added to the SVG (keeps it out of the GPU compositing path + * in the common case). + */ +export function overlayFilterForRunIndex(idx: number): string | null { + if (idx <= 0) return null; + const hue = (idx * OVERLAY_HUE_STEP_DEG) % 360; + return `hue-rotate(${hue}deg) saturate(1.2)`; +} + +/** + * Resolve a point's run index from its `run_url`. 
Falls back to parsing + * the numeric id out of `/runs/` — needed because `updateRepoUrl` + * may have rewritten the host/org between the tooltip path and the raw + * URL stored on the point. + */ +export function overlayRunIndex( + runUrl: string | null | undefined, + map: Record, +): number { + if (!runUrl) return 0; + if (runUrl in map) return map[runUrl]; + const idMatch = runUrl.match(/\/runs\/(\d+)/); + if (idMatch && idMatch[1] in map) return map[idMatch[1]]; + return 0; +} From 4316c1cc2a2c3809f49103b939974d55dd0fd41f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 06:02:21 -0700 Subject: [PATCH 4/9] fix: map dsv4pro benchmark prefix to dsv4 DB key Benchmark artifacts for DeepSeek-V4-Pro runs (e.g. run 24884703163) emit `infmax_model_prefix: "dsv4pro"` while the canonical DB key is `dsv4`. Without an alias the prefix resolver fell through all three strategies (direct match, alias table, precision-suffix strip) and every row was dropped as `unmappedModel`, so unofficial-run queries for these runs returned an empty benchmark set. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/normalizers.test.ts | 5 +++++ packages/db/src/etl/normalizers.ts | 1 + 2 files changed, 6 insertions(+) diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts index 7b0ac0e2..5598b985 100644 --- a/packages/db/src/etl/normalizers.test.ts +++ b/packages/db/src/etl/normalizers.test.ts @@ -116,6 +116,11 @@ describe('resolveModelKey', () => { expect(resolveModelKey({ infmax_model_prefix: 'gptoss' })).toBe('gptoss120b'); }); + it('resolves dsv4pro alias from prefix', () => { + expect(resolveModelKey({ infmax_model_prefix: 'dsv4pro' })).toBe('dsv4'); + expect(resolveModelKey({ infmax_model_prefix: 'dsv4pro-fp8' })).toBe('dsv4'); + }); + it('falls back to MODEL_TO_KEY when prefix not present', () => { expect(resolveModelKey({ model: 'deepseek-ai/DeepSeek-R1' })).toBe('dsr1'); expect(resolveModelKey({ model: 'nvidia/Llama-3.3-70B-Instruct-FP8' })).toBe('llama70b'); diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index f3cc13dd..51b00df6 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -52,6 +52,7 @@ const PRECISION_SUFFIX = /-(?:fp4|fp8|mxfp4|nvfp4)(?:-.*)?$/i; /** Explicit aliases for prefixes that don't match any DB key after suffix stripping. */ const PREFIX_ALIASES: Record = { gptoss: 'gptoss120b', + dsv4pro: 'dsv4', }; function resolvePrefixToKey(prefix: string): string | null { From bf178bfba3b83d83e5ea608b9008f34de854b70d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 06:35:49 -0700 Subject: [PATCH 5/9] fix(unofficial-run): make per-run hue shift actually visible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three stacked fixes so multiple unofficial runs don't all look the same: 1. 
Include overlay hw keys in the vendor-color active set so overlay strokes get a real hue instead of the muted-foreground fallback — hue-rotate on gray is a no-op, which was the main reason runs appeared identical. 2. Strengthen the per-run CSS filter: saturate(2.2) hue-rotate brightness(1.1), and widen the hue step from 55° to 80° for more separation. 3. Use a different stroke-dasharray per run index on overlay rooflines so runs stay distinguishable even when the filter can't produce a shift. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 20 +++++++++---- packages/app/src/lib/overlay-run-style.ts | 30 +++++++++++++++---- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 2d881fcb..bb4411a1 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -21,7 +21,11 @@ import type { } from '@/lib/d3-chart/D3Chart/types'; import type { ContinuousScale } from '@/lib/d3-chart/types'; import { computeTooltipPosition } from '@/lib/d3-chart/layers/scatter-points'; -import { overlayFilterForRunIndex, overlayRunIndex } from '@/lib/overlay-run-style'; +import { + overlayFilterForRunIndex, + overlayRooflineDasharray, + overlayRunIndex, +} from '@/lib/overlay-run-style'; import { POINT_SIZE, HIT_AREA_RADIUS, @@ -201,14 +205,18 @@ const ScatterGraph = React.memo( activeOverlayHwTypes.forEach((k) => keys.push(`overlay:${k}`)); return keys; }, [effectiveOfficialHwTypes, activeOverlayHwTypes]); - const activeOfficialKeys = useMemo( - () => [...effectiveOfficialHwTypes], - [effectiveOfficialHwTypes], + // Vendor color map keys — include overlay hw keys (unprefixed) so overlay + // strokes resolve to a real hue instead of falling through to the muted + // fallback. 
Without this, `hue-rotate` on overlay lines would be a no-op + // because the input is gray. + const activeVendorKeys = useMemo( + () => [...new Set([...effectiveOfficialHwTypes, ...activeOverlayHwTypes])], + [effectiveOfficialHwTypes, activeOverlayHwTypes], ); const { resolveColor, getCssColor } = useThemeColors({ highContrast, identifiers: activeHwKeys, - activeKeys: activeOfficialKeys, + activeKeys: activeVendorKeys, }); // --- Changelog --- @@ -1314,7 +1322,7 @@ const ScatterGraph = React.memo( .attr('fill', 'none') .attr('stroke', (d) => d.stroke) .attr('stroke-width', 2) - .attr('stroke-dasharray', '6 3') + .attr('stroke-dasharray', (d) => overlayRooflineDasharray(d.runIndex)) .attr('d', (d) => lineGen(d.points)) .style('filter', (d) => overlayFilterForRunIndex(d.runIndex)); diff --git a/packages/app/src/lib/overlay-run-style.ts b/packages/app/src/lib/overlay-run-style.ts index 5cb5de5f..7b651808 100644 --- a/packages/app/src/lib/overlay-run-style.ts +++ b/packages/app/src/lib/overlay-run-style.ts @@ -4,19 +4,37 @@ * and the evaluation bar chart so both charts apply the same per-run hue. */ -/** Degrees of hue rotation per run index. Tuned so that up to ~6 runs stay distinguishable. */ -const OVERLAY_HUE_STEP_DEG = 55; +/** Degrees of hue rotation per run index. 80° per step cycles through 4-5 distinct bands. */ +const OVERLAY_HUE_STEP_DEG = 80; /** * CSS `filter` value to apply to an overlay group for a given run index. - * Returns `null` for index 0 so single-run behavior is unchanged and no - * filter is added to the SVG (keeps it out of the GPU compositing path - * in the common case). + * + * Returns `null` for index 0 so single-run behavior is unchanged. + * + * For index >= 1, we stack `hue-rotate + saturate + brightness`: + * - `saturate(2.2)` forces saturation up *before* rotating so near-gray base + * colors (e.g. 
when `resolveColor` falls back to `--muted-foreground`) pick + * up a visible hue rather than staying gray; + * - `brightness(1.1)` lifts the result slightly on dark backgrounds. + * + * Applied via `style('filter', ...)` — works regardless of whether the + * underlying stroke color is a CSS variable, oklch, or hex. */ export function overlayFilterForRunIndex(idx: number): string | null { if (idx <= 0) return null; const hue = (idx * OVERLAY_HUE_STEP_DEG) % 360; - return `hue-rotate(${hue}deg) saturate(1.2)`; + return `saturate(2.2) hue-rotate(${hue}deg) brightness(1.1)`; +} + +/** + * Dash pattern for an overlay roofline at a given run index. Different patterns + * stack on top of the color filter so runs remain distinguishable even when + * CSS filters can't produce a hue shift (e.g. pure-gray base strokes). + */ +const ROOFLINE_DASH_BY_RUN = ['6 3', '2 3', '10 3 2 3', '5 3 2 3 2 3', '12 2', '3 1']; +export function overlayRooflineDasharray(runIndex: number): string { + return ROOFLINE_DASH_BY_RUN[runIndex % ROOFLINE_DASH_BY_RUN.length]; } /** From 303a17d335f538717b4ac182623b640302e1a566 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 06:53:25 -0700 Subject: [PATCH 6/9] fix(unofficial-run): use explicit per-run palette instead of CSS filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CSS-filter approach made the legend and chart diverge: the legend rendered each overlay hwKey's vendor color (red for MI355X), while the chart stroke got the same base color *plus* a hue-rotate filter that shifted it to an unrelated hue. Since the legend's colored dot is a direct backgroundColor style, there was no clean way to apply the same filter to it. Switch to an explicit OKLch palette indexed by run order — both the overlay stroke and the legend swatch read from the same palette, so they match exactly. 
Restructure the overlay legend section to show one entry per loaded run (branch name) rather than per-hardware, since N runs × M hardware keys can't collapse to a single color per hw. Hardware identity for overlay points is still visible in the point label and tooltip; the X-mark shape and legend branch labels carry the run identity. Roofline dash-pattern per run is kept as a secondary (colorblind-friendly) encoding. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/evaluation/ui/BarChartD3.tsx | 91 +++++++++++-------- .../components/inference/ui/ScatterGraph.tsx | 70 +++++++------- packages/app/src/lib/overlay-run-style.ts | 78 ++++++++++------ 3 files changed, 135 insertions(+), 104 deletions(-) diff --git a/packages/app/src/components/evaluation/ui/BarChartD3.tsx b/packages/app/src/components/evaluation/ui/BarChartD3.tsx index e4d961b3..5af0fcd9 100644 --- a/packages/app/src/components/evaluation/ui/BarChartD3.tsx +++ b/packages/app/src/components/evaluation/ui/BarChartD3.tsx @@ -24,7 +24,7 @@ import { Skeleton } from '@/components/ui/skeleton'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { useThemeColors } from '@/hooks/useThemeColors'; import { computeToggle } from '@/hooks/useTogglableSet'; -import { overlayFilterForRunIndex, overlayRunIndex } from '@/lib/overlay-run-style'; +import { overlayRunColor, overlayRunIndex } from '@/lib/overlay-run-style'; const BASE_MARGIN = { top: 24, right: 24, bottom: 52 }; const OVERLAY_X_SIZE = 6; @@ -159,6 +159,7 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { const { isUnofficialRun, unofficialRunInfo, + unofficialRunInfos, activeOverlayHwTypes, setActiveOverlayHwTypes, allOverlayHwTypes, @@ -320,33 +321,45 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { const legendItems = useMemo( () => [ - ...unofficialConfigurations.map(({ hwKey, configLabel }) => ({ - name: `✕ ${configLabel}`, - label: `✕ 
${configLabel.replaceAll('\n', ' ')}`, - color: resolveColor(configLabel, hwKey), - title: `UNOFFICIAL: ${configLabel.replaceAll('\n', ' ')}`, - isHighlighted: true, - hw: `overlay:${hwKey}`, - isActive: true, - onClick: () => {}, - tooltip: ( -
-
UNOFFICIAL RUN
-
Branch: {unofficialRunInfo?.branch}
-
Config: {configLabel.replaceAll('\n', ' ')}
- {unofficialRunInfo?.url && ( - - View workflow run - - )} -
- ), - })), + // Overlay legend: one entry per loaded unofficial run that contributes + // points to the current chart. Same palette color as the chart strokes. + ...(unofficialConfigurations.length > 0 && unofficialRunInfos.length > 0 + ? unofficialRunInfos + .map((info, idx) => { + const hasPoints = unofficialChartData.some( + (d) => overlayRunIndex(d.runUrl ?? null, runIndexByUrl) === idx, + ); + if (!hasPoints) return null; + const branch = info.branch || `run ${info.id}`; + return { + name: `✕ unofficial-run-${info.id}`, + label: `✕ ${branch}`, + color: overlayRunColor(idx), + title: `UNOFFICIAL: ${branch}`, + isHighlighted: true, + hw: `overlay-run-${info.id}`, + isActive: true, + onClick: () => {}, + tooltip: ( +
+
UNOFFICIAL RUN
+
Branch: {branch}
+ {info.url && ( + + View workflow run + + )} +
+ ), + }; + }) + .filter((x): x is NonNullable => x !== null) + : []), ...configurations.map(({ hwKey, configLabel }) => ({ name: configLabel, label: configLabel.replaceAll('\n', ' '), @@ -368,7 +381,9 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { highlightedConfigs, resolveColor, unofficialConfigurations, - unofficialRunInfo, + unofficialChartData, + unofficialRunInfos, + runIndexByUrl, ], ); @@ -537,14 +552,14 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { return bar; }); - bars.style('filter', (d) => - overlayFilterForRunIndex(overlayRunIndex(d.runUrl ?? null, runIndexByUrl)), - ); + bars.style('filter', null); bars .selectAll( '.unofficial-eb-stem, .unofficial-eb-cap-top, .unofficial-eb-cap-bot', ) - .attr('stroke', (d) => getCssColor(resolveColor(d.configLabel, String(d.hwKey)))); + .attr('stroke', (d) => + overlayRunColor(overlayRunIndex(d.runUrl ?? null, runIndexByUrl)), + ); bars .select('.unofficial-eb-stem') @@ -689,15 +704,13 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { (d) => `translate(${xScale(d.score)},${(yScale(d.configLabel) || 0) + yScale.bandwidth() / 2})`, ); - // Per-run hue shift at the group level so the X-mark + score label - // inherit the same tone and stay visually grouped. - overlayPoints.style('filter', (d) => - overlayFilterForRunIndex(overlayRunIndex(d.runUrl ?? null, runIndexByUrl)), - ); + overlayPoints.style('filter', null); overlayPoints .select('.unofficial-eval-x') - .attr('stroke', (d) => getCssColor(resolveColor(d.configLabel, String(d.hwKey)))); + .attr('stroke', (d) => + overlayRunColor(overlayRunIndex(d.runUrl ?? 
null, runIndexByUrl)), + ); overlayPoints.each(function (d) { d3.select(this) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index bb4411a1..e6c6108e 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -22,8 +22,8 @@ import type { import type { ContinuousScale } from '@/lib/d3-chart/types'; import { computeTooltipPosition } from '@/lib/d3-chart/layers/scatter-points'; import { - overlayFilterForRunIndex, overlayRooflineDasharray, + overlayRunColor, overlayRunIndex, } from '@/lib/overlay-run-style'; import { @@ -153,6 +153,7 @@ const ScatterGraph = React.memo( localOfficialOverride, setLocalOfficialOverride, runIndexByUrl, + unofficialRunInfos, } = useUnofficialRun(); const chartRef = useRef(null); @@ -205,18 +206,14 @@ const ScatterGraph = React.memo( activeOverlayHwTypes.forEach((k) => keys.push(`overlay:${k}`)); return keys; }, [effectiveOfficialHwTypes, activeOverlayHwTypes]); - // Vendor color map keys — include overlay hw keys (unprefixed) so overlay - // strokes resolve to a real hue instead of falling through to the muted - // fallback. Without this, `hue-rotate` on overlay lines would be a no-op - // because the input is gray. 
- const activeVendorKeys = useMemo( - () => [...new Set([...effectiveOfficialHwTypes, ...activeOverlayHwTypes])], - [effectiveOfficialHwTypes, activeOverlayHwTypes], + const activeOfficialKeys = useMemo( + () => [...effectiveOfficialHwTypes], + [effectiveOfficialHwTypes], ); const { resolveColor, getCssColor } = useThemeColors({ highContrast, identifiers: activeHwKeys, - activeKeys: activeVendorKeys, + activeKeys: activeOfficialKeys, }); // --- Changelog --- @@ -1304,7 +1301,8 @@ const ScatterGraph = React.memo( ovEntries.push({ key, points: group.points, - stroke: getCssColor(resolveColor(group.hwKey)), + // Color by run — same palette entry the legend uses, so they match. + stroke: overlayRunColor(group.runIndex), runIndex: group.runIndex, }); } @@ -1324,7 +1322,7 @@ const ScatterGraph = React.memo( .attr('stroke-width', 2) .attr('stroke-dasharray', (d) => overlayRooflineDasharray(d.runIndex)) .attr('d', (d) => lineGen(d.points)) - .style('filter', (d) => overlayFilterForRunIndex(d.runIndex)); + .style('filter', null); // Overlay X-shape points — index-keyed so every point renders const overlayPoints = zoomGroup @@ -1353,14 +1351,12 @@ const ScatterGraph = React.memo( }); overlayPoints.attr('transform', (d) => `translate(${xScale(d.x)},${yScale(d.y)})`); - // Apply per-run hue shift at the group level so the shape and its - // label inherit the same tone and stay visually grouped. - overlayPoints.style('filter', (d) => - overlayFilterForRunIndex(overlayRunIndex(d.run_url ?? null, runIndexByUrl)), - ); + overlayPoints.style('filter', null); overlayPoints .select('.overlay-x') - .attr('stroke', (d) => getCssColor(resolveColor(d.hwKey as string))); + .attr('stroke', (d) => + overlayRunColor(overlayRunIndex(d.run_url ?? null, runIndexByUrl)), + ); // Labels const showLabels = !hidePointLabels && !showGradientLabels; @@ -1689,32 +1685,35 @@ const ScatterGraph = React.memo( onItemHoverEnd={handleLegendHoverEnd} onItemRemove={showAllHardwareTypes ? 
undefined : removeHwType} legendItems={[ - ...(overlayData - ? Object.entries(overlayData.hardwareConfig) - .filter(([key]) => - overlayData.data.some( - (d) => d.hwKey === key && selectedPrecisions.includes(d.precision), - ), - ) - .map(([key, hwConfig]) => { - const parsed = parseHwKeyToLabel(key); + // Overlay legend: one entry per loaded unofficial run that actually + // contributes points to this chart. Colored from the shared palette + // so the legend swatch matches the stroke color used in the chart. + ...(overlayData && unofficialRunInfos.length > 0 + ? unofficialRunInfos + .map((info, idx) => { + const hasPoints = overlayData.data.some( + (d) => + overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx && + selectedPrecisions.includes(d.precision), + ); + if (!hasPoints) return null; + const branch = info.branch || `run ${info.id}`; return { - name: `✕ ${key}`, - label: `✕ ${parsed.label}`, - color: resolveColor(key), - title: `UNOFFICIAL: ${hwConfig.framework || parsed.label}`, + name: `✕ unofficial-run-${info.id}`, + label: `✕ ${branch}`, + color: overlayRunColor(idx), + title: `UNOFFICIAL: ${branch}`, isHighlighted: true, - hw: `overlay-${key}`, + hw: `overlay-run-${info.id}`, isActive: true, onClick: () => {}, tooltip: (
UNOFFICIAL RUN
-
Branch: {overlayData.label}
-
Hardware: {parsed.label}
- {overlayData.runUrl && ( +
Branch: {branch}
+ {info.url && ( => x !== null) : []), ...Object.entries(hardwareConfig) .filter(([key]) => diff --git a/packages/app/src/lib/overlay-run-style.ts b/packages/app/src/lib/overlay-run-style.ts index 7b651808..62baa6fd 100644 --- a/packages/app/src/lib/overlay-run-style.ts +++ b/packages/app/src/lib/overlay-run-style.ts @@ -1,47 +1,65 @@ /** * Shared helpers for visually differentiating unofficial-run overlay points - * when more than one run is loaded. Consumed by the inference scatter plot - * and the evaluation bar chart so both charts apply the same per-run hue. + * when one or more runs are loaded. Consumed by the inference scatter plot + * and the evaluation bar chart. + * + * Design: instead of applying a CSS filter to an hwKey-derived base color + * (which is brittle — `hue-rotate` on gray is a no-op, and filter output + * can't be re-used in legend swatches that style `background-color` directly), + * we assign each run a fixed palette color. The same palette is used by the + * chart strokes AND the legend entries, so they always match visually. + * + * Trade-off: overlay points no longer encode hardware via color. Hardware is + * still identifiable via the X-mark shape, the point label (TP number or + * advanced label), and the tooltip. */ -/** Degrees of hue rotation per run index. 80° per step cycles through 4-5 distinct bands. */ -const OVERLAY_HUE_STEP_DEG = 80; - /** - * CSS `filter` value to apply to an overlay group for a given run index. - * - * Returns `null` for index 0 so single-run behavior is unchanged. - * - * For index >= 1, we stack `hue-rotate + saturate + brightness`: - * - `saturate(2.2)` forces saturation up *before* rotating so near-gray base - * colors (e.g. when `resolveColor` falls back to `--muted-foreground`) pick - * up a visible hue rather than staying gray; - * - `brightness(1.1)` lifts the result slightly on dark backgrounds. 
- * - * Applied via `style('filter', ...)` — works regardless of whether the - * underlying stroke color is a CSS variable, oklch, or hex. + * Palette for overlay runs, in load-order. Tuned for dark mode primarily but + * readable on light backgrounds too. Each entry is a saturated OKLch string + * so it shows even when the underlying theme colors are muted. */ -export function overlayFilterForRunIndex(idx: number): string | null { - if (idx <= 0) return null; - const hue = (idx * OVERLAY_HUE_STEP_DEG) % 360; - return `saturate(2.2) hue-rotate(${hue}deg) brightness(1.1)`; +const RUN_PALETTE: readonly string[] = [ + 'oklch(0.72 0.22 25)', // warm red + 'oklch(0.75 0.20 190)', // teal + 'oklch(0.78 0.20 90)', // amber + 'oklch(0.70 0.22 290)', // violet + 'oklch(0.75 0.20 150)', // green + 'oklch(0.70 0.22 330)', // magenta + 'oklch(0.72 0.20 230)', // blue + 'oklch(0.78 0.18 60)', // yellow-orange +]; + +/** Return the palette color for a given run index (wraps on overflow). */ +export function overlayRunColor(runIndex: number): string { + return RUN_PALETTE[((runIndex % RUN_PALETTE.length) + RUN_PALETTE.length) % RUN_PALETTE.length]; } /** - * Dash pattern for an overlay roofline at a given run index. Different patterns - * stack on top of the color filter so runs remain distinguishable even when - * CSS filters can't produce a hue shift (e.g. pure-gray base strokes). + * Dash pattern for an overlay roofline at a given run index. Layered on top + * of the per-run color so runs stay distinguishable even on grayscale + * screenshots or print. 
*/ -const ROOFLINE_DASH_BY_RUN = ['6 3', '2 3', '10 3 2 3', '5 3 2 3 2 3', '12 2', '3 1']; +const ROOFLINE_DASH_BY_RUN: readonly string[] = [ + '6 3', + '2 3', + '10 3 2 3', + '5 3 2 3 2 3', + '12 2', + '3 1', +]; export function overlayRooflineDasharray(runIndex: number): string { - return ROOFLINE_DASH_BY_RUN[runIndex % ROOFLINE_DASH_BY_RUN.length]; + return ROOFLINE_DASH_BY_RUN[ + ((runIndex % ROOFLINE_DASH_BY_RUN.length) + ROOFLINE_DASH_BY_RUN.length) % + ROOFLINE_DASH_BY_RUN.length + ]; } /** - * Resolve a point's run index from its `run_url`. Falls back to parsing - * the numeric id out of `/runs/` — needed because `updateRepoUrl` - * may have rewritten the host/org between the tooltip path and the raw - * URL stored on the point. + * Resolve a point's run index from its `run_url`. Falls back to parsing the + * numeric id out of `/runs/` — needed because `updateRepoUrl` may + * rewrite the host/org between the raw URL stored on the point and the + * lookup map constructed from run metadata. */ export function overlayRunIndex( runUrl: string | null | undefined, From 4800118aa8d5792a6b15f970a3b98583dbc279c4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 10:47:01 -0700 Subject: [PATCH 7/9] fix(unofficial-run): auto-switch model when URL loads a run for an unselected model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Navigating to ?unofficialrun= when `g_model` isn't set in the URL used to silently leave the dashboard on the default DeepSeek-R1 model. If the run only contained data for a different model (e.g. the DeepSeek-V4-Pro run 24889121634 on MI355X), the user saw no overlay and had to know to manually switch the model dropdown. Now, when an unofficial run is loaded and `g_model` wasn't provided, auto-switch to the first model the run contributes data for — once, so subsequent manual selections stick. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/components/GlobalFilterContext.tsx | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 2780a202..108eaca2 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -172,6 +172,27 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { }); }, [availabilityRows, unofficialAvailable]); + // Auto-switch the selected model when an unofficial run is loaded that + // doesn't include the currently selected model. Without this, navigating + // to `?unofficialrun=` while the default `g_model=DeepSeek-R1` sticks + // leaves the user staring at a chart with no overlay points — they'd have + // to know to open the dropdown and pick the run's model themselves. + // + // Skipped when `g_model` was set explicitly in the URL (respect the user's + // intent) and when the current model is already covered by the overlay. 
+ const autoSwitchedRef = useRef(false); + useEffect(() => { + if (autoSwitchedRef.current) return; + if (unofficialAvailable.length === 0) return; + const urlModel = getUrlParam('g_model'); + if (urlModel) return; + const unofficialModels = new Set(unofficialAvailable.map((a) => a.model)); + if (unofficialModels.has(selectedModel)) return; + const target = unofficialAvailable[0].model; + autoSwitchedRef.current = true; + setSelectedModel(target); + }, [unofficialAvailable, selectedModel]); + // Sequences available for the selected model (DB ∪ unofficial run for this model) const availableSequences = useMemo(() => { const unofficialSeqs = unofficialAvailable From 81a1ca2b77600dd3a75ec8abfe2da1c40e88d982 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 17:38:43 -0700 Subject: [PATCH 8/9] fix(unofficial-run): re-trigger auto-switch when run set changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous auto-switch used a one-shot ref, so navigating from one unofficial run to another in the same session (e.g. swapping the runId in the URL) wouldn't re-evaluate which model to land on. If a user had been viewing run A on DeepSeek-V4-Pro and then navigated to run B that also has DeepSeek-V4-Pro data, that's fine — but if run B has data for a different model and the user happens to currently sit on a model that B doesn't cover, they'd see an empty chart with no overlay. Switch the guard to a stringified key of the (model, sequence) set from the current unofficial run, so each new run set re-evaluates the switch. Manual model changes while the same run is loaded still stick because the key doesn't change. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/components/GlobalFilterContext.tsx | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 108eaca2..f605e1d1 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -180,17 +180,28 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // // Skipped when `g_model` was set explicitly in the URL (respect the user's // intent) and when the current model is already covered by the overlay. - const autoSwitchedRef = useRef(false); + // + // We key the "did we already switch?" check against the stringified set of + // (model, sequence) pairs from the unofficial run, so navigating from one + // run to another with a different model will re-trigger the switch — but + // a manual model change while the same run set is loaded will stick. 
+ const lastAutoSwitchKeyRef = useRef(''); useEffect(() => { - if (autoSwitchedRef.current) return; - if (unofficialAvailable.length === 0) return; + if (unofficialAvailable.length === 0) { + lastAutoSwitchKeyRef.current = ''; + return; + } const urlModel = getUrlParam('g_model'); if (urlModel) return; + const key = unofficialAvailable + .map((a) => `${a.model}|${a.sequence}`) + .toSorted() + .join(','); + if (lastAutoSwitchKeyRef.current === key) return; + lastAutoSwitchKeyRef.current = key; const unofficialModels = new Set(unofficialAvailable.map((a) => a.model)); if (unofficialModels.has(selectedModel)) return; - const target = unofficialAvailable[0].model; - autoSwitchedRef.current = true; - setSelectedModel(target); + setSelectedModel(unofficialAvailable[0].model); }, [unofficialAvailable, selectedModel]); // Sequences available for the selected model (DB ∪ unofficial run for this model) From 5117bffc5b030d349bac373f1d6c083ad2313a6d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:31:39 -0700 Subject: [PATCH 9/9] fix(normalizers): strip -cw hw suffix so gb300-cw maps to gb300 Run 24936260529 uses hw: "gb300-cw" which wasn't recognized. 
Co-Authored-By: Claude Opus 4.6 --- packages/db/src/etl/normalizers.test.ts | 4 ++++ packages/db/src/etl/normalizers.ts | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts index 5598b985..21281a2a 100644 --- a/packages/db/src/etl/normalizers.test.ts +++ b/packages/db/src/etl/normalizers.test.ts @@ -68,6 +68,10 @@ describe('hwToGpuKey', () => { expect(hwToGpuKey('b300-nb')).toBe('b300'); }); + it('strips -cw suffix', () => { + expect(hwToGpuKey('gb300-cw')).toBe('gb300'); + }); + it('strips runner index suffix before other suffixes', () => { expect(hwToGpuKey('mi355x-amd_0')).toBe('mi355x'); expect(hwToGpuKey('mi355x-amd_2')).toBe('mi355x'); diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index 51b00df6..1719a765 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null { .replace(/-dgxc-slurm$/, '') .replace(/-dgxc$/, '') .replace(/-nb$/, '') - .replace(/-nv$/, ''); + .replace(/-nv$/, '') + .replace(/-cw$/, ''); return GPU_KEYS.has(base) ? base : null; }