From 7d12906e6ca974d7d52dbdce531ca9bf24b9eec6 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:40:15 -0500 Subject: [PATCH] feat: add Pareto uplift panel for PR regression tracking --- .../components/inference/ui/ChartDisplay.tsx | 25 ++ .../inference/ui/ParetoUpliftPanel.test.tsx | 234 ++++++++++ .../inference/ui/ParetoUpliftPanel.tsx | 418 ++++++++++++++++++ packages/app/src/lib/pareto-uplift.test.ts | 262 +++++++++++ packages/app/src/lib/pareto-uplift.ts | 235 ++++++++++ packages/app/vitest.config.ts | 2 +- 6 files changed, 1175 insertions(+), 1 deletion(-) create mode 100644 packages/app/src/components/inference/ui/ParetoUpliftPanel.test.tsx create mode 100644 packages/app/src/components/inference/ui/ParetoUpliftPanel.tsx create mode 100644 packages/app/src/lib/pareto-uplift.test.ts create mode 100644 packages/app/src/lib/pareto-uplift.ts diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 2ecc93b2..fb9b859a 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -43,6 +43,7 @@ import ComparisonChangelog from './ComparisonChangelog'; import CustomCosts from './CustomCosts'; import CustomPowers from './CustomPowers'; import GPUGraph from './GPUGraph'; +import ParetoUpliftPanel from './ParetoUpliftPanel'; import TrendChart from './TrendChart'; const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagram'), { @@ -144,6 +145,7 @@ export default function ChartDisplay() { activeHwTypes, activeDates, setSelectedE2eXAxisMetric, + hardwareConfig, } = useInference(); const { @@ -501,6 +503,29 @@ export default function ChartDisplay() { ); })()} + 0, + )} + overlayData={ + graph.chartDefinition.chartType === 'e2e' + ? (overlayDataByChartType.e2e ?? undefined) + : (overlayDataByChartType.interactivity ?? 
undefined) + } + chartType={graph.chartDefinition.chartType} + /> diff --git a/packages/app/src/components/inference/ui/ParetoUpliftPanel.test.tsx b/packages/app/src/components/inference/ui/ParetoUpliftPanel.test.tsx new file mode 100644 index 00000000..4b97f2b5 --- /dev/null +++ b/packages/app/src/components/inference/ui/ParetoUpliftPanel.test.tsx @@ -0,0 +1,234 @@ +// @vitest-environment jsdom +import React, { act } from 'react'; +import { createRoot, type Root } from 'react-dom/client'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import type { + ChartDefinition, + HardwareConfig, + InferenceData, + OverlayData, +} from '@/components/inference/types'; + +vi.mock('@/lib/constants', () => ({ + getModelSortIndex: () => 0, +})); + +import ParetoUpliftPanel from './ParetoUpliftPanel'; + +let container: HTMLDivElement; +let root: Root; + +function renderUi(ui: React.ReactNode) { + act(() => root.render(ui)); +} + +beforeEach(() => { + container = document.createElement('div'); + document.body.append(container); + root = createRoot(container); +}); + +afterEach(() => { + act(() => root.unmount()); + container.remove(); +}); + +interface PtOpts { + ttft?: number; + p99_ttft?: number; + tpot?: number; + e2el?: number; + intvty?: number; +} + +function pt(x: number, y: number, hwKey: string, date: string, opts: PtOpts = {}): InferenceData { + return { + date, + x, + y, + tp: 1, + conc: 1, + hwKey, + precision: 'fp8', + tpPerGpu: { y, roof: false }, + tpPerMw: { y, roof: false }, + costh: { y, roof: false }, + costn: { y, roof: false }, + costr: { y, roof: false }, + costhi: { y, roof: false }, + costni: { y, roof: false }, + costri: { y, roof: false }, + median_ttft: opts.ttft, + p99_ttft: opts.p99_ttft, + median_tpot: opts.tpot, + median_e2el: opts.e2el, + median_intvty: opts.intvty, + }; +} + +const interactivityChartDef: ChartDefinition = { + chartType: 'interactivity', + heading: 'vs. 
Interactivity', + x: 'median_intvty', + x_label: 'Interactivity (tok/s/user)', + y: 'tput_per_gpu', + y_tpPerGpu: 'tpPerGpu.y', + y_tpPerGpu_label: 'Token Throughput per GPU', + y_tpPerGpu_title: 'Token Throughput per GPU', + y_tpPerGpu_roofline: 'upper_left', +}; + +const hardwareConfig: HardwareConfig = { + h100: { name: 'h100', label: 'H100', suffix: '(TRT)', gpu: 'H100 TRT' }, + b200: { name: 'b200', label: 'B200', suffix: '(TRT)', gpu: 'B200 TRT' }, +}; + +const baseProps = { + chartDefinition: interactivityChartDef, + selectedYAxisMetric: 'y_tpPerGpu', + hardwareConfig, + activeHwTypes: new Set(['h100', 'b200']), + activeDates: new Set(), + selectedPrecisions: ['fp8'], + selectedRunDate: '2025-04-17', + selectedDates: [] as string[], + selectedDateRange: { startDate: '', endDate: '' }, + isTimelineMode: false, + chartType: 'interactivity', +}; + +describe('ParetoUpliftPanel', () => { + it('renders nothing when there are no comparison dates and no overlay', () => { + const data = [pt(10, 300, 'h100', '2025-04-17'), pt(20, 200, 'h100', '2025-04-17')]; + renderUi(); + expect(container.querySelector('[data-testid="pareto-uplift-panel"]')).toBeNull(); + }); + + it('renders a GPU × metric table with a primary Pareto row plus scalar time-stat rows', () => { + // Reference (main date): Pareto-valid upper_left front with 2× throughput and halved latencies. 
+ const data = [ + pt(10, 300, 'h100', '2025-04-17', { + ttft: 0.05, + p99_ttft: 0.1, + tpot: 0.02, + e2el: 0.5, + intvty: 50, + }), + pt(20, 200, 'h100', '2025-04-17', { + ttft: 0.04, + p99_ttft: 0.08, + tpot: 0.015, + e2el: 0.4, + intvty: 60, + }), + pt(10, 150, 'h100', '2025-04-10', { + ttft: 0.1, + p99_ttft: 0.2, + tpot: 0.04, + e2el: 1, + intvty: 25, + }), + pt(20, 100, 'h100', '2025-04-10', { + ttft: 0.08, + p99_ttft: 0.16, + tpot: 0.03, + e2el: 0.8, + intvty: 30, + }), + ]; + renderUi(); + + const panel = container.querySelector('[data-testid="pareto-uplift-panel"]'); + expect(panel).not.toBeNull(); + + const headers = [...container.querySelectorAll('thead th')].map((th) => th.textContent); + expect(headers[0]).toBe('GPU'); + expect(headers[1]).toBe('Metric'); + expect(headers[2]).toContain('2025-04-10'); + + const rows = [...container.querySelectorAll('tbody tr')]; + // 1 primary (Pareto) + 5 scalar metrics (Median TTFT, P99 TTFT, Median TPOT, Median E2EL, Interactivity) + expect(rows).toHaveLength(6); + + // First row: GPU name, "Token Throughput per GPU" metric label, +% cell (Pareto). + const firstCells = rows[0].querySelectorAll('td'); + expect(firstCells[0].textContent).toContain('H100'); + expect(firstCells[1].textContent).toContain('Token Throughput per GPU'); + expect(firstCells[2].textContent).toMatch(/\+/); + + // Subsequent rows carry no GPU label (only first row does). + expect(rows[1].querySelectorAll('td')[0].textContent).toBe(''); + // Time-stat metrics present in order. 
+ const metricLabels = rows.map((r) => r.querySelectorAll('td')[1].textContent); + expect(metricLabels).toEqual([ + 'Token Throughput per GPU', + 'Median TTFT', + 'P99 TTFT', + 'Median TPOT', + 'Median E2EL', + 'Interactivity', + ]); + + // TTFT row: ref mean 0.045, hist mean 0.09 → ratio 2 → "+100%" + const ttftCell = rows[1].querySelectorAll('td')[2]; + expect(ttftCell.textContent).toContain('+100.0%'); + + // Interactivity (higher is better): ref mean 55, hist mean 27.5 → ratio 2 → "+100%" + const intvtyCell = rows[5].querySelectorAll('td')[2]; + expect(intvtyCell.textContent).toContain('+100.0%'); + }); + + it('uses overlay data as the reference when an unofficial PR run is present', () => { + const overlayData: OverlayData = { + data: [ + pt(10, 600, 'h100', '2025-04-17', { ttft: 0.025, intvty: 100 }), + pt(20, 400, 'h100', '2025-04-17', { ttft: 0.02, intvty: 120 }), + ], + hardwareConfig, + label: 'feat/new-kernel', + }; + const data = [ + pt(10, 300, 'h100', '2025-04-17', { ttft: 0.05, intvty: 50 }), + pt(20, 200, 'h100', '2025-04-17', { ttft: 0.04, intvty: 60 }), + ]; + renderUi(); + const panel = container.querySelector('[data-testid="pareto-uplift-panel"]'); + expect(panel).not.toBeNull(); + expect(panel?.textContent).toContain('feat/new-kernel'); + + const headers = [...container.querySelectorAll('thead th')].map((th) => th.textContent); + expect(headers.some((h) => h?.includes('2025-04-17'))).toBe(true); + }); + + it('renders scalar rows even when the primary Pareto row is not computable', () => { + // Only 1 point on the reference side — too few for a Pareto front. 
+ const data = [ + pt(10, 300, 'h100', '2025-04-17', { ttft: 0.05 }), + pt(10, 150, 'h100', '2025-04-10', { ttft: 0.1 }), + ]; + renderUi(); + const rows = [...container.querySelectorAll('tbody tr')]; + const labels = rows.map((r) => r.querySelectorAll('td')[1].textContent); + expect(labels).not.toContain('Token Throughput per GPU'); + expect(labels).toContain('Median TTFT'); + }); + + it('hides rows whose hwKey has no usable historical overlap', () => { + const data = [ + pt(10, 300, 'h100', '2025-04-17', { ttft: 0.05 }), + pt(20, 200, 'h100', '2025-04-17', { ttft: 0.04 }), + // b200 has main-date data but no historical date data. + pt(10, 300, 'b200', '2025-04-17', { ttft: 0.05 }), + pt(20, 200, 'b200', '2025-04-17', { ttft: 0.04 }), + pt(10, 150, 'h100', '2025-04-10', { ttft: 0.1 }), + pt(20, 100, 'h100', '2025-04-10', { ttft: 0.08 }), + ]; + renderUi(); + const gpuLabels = [...container.querySelectorAll('tbody tr')] + .map((r) => r.querySelectorAll('td')[0].textContent) + .filter((s) => s && s.length > 0); + expect(gpuLabels).toContain('H100 (TRT)'); + expect(gpuLabels).not.toContain('B200 (TRT)'); + }); +}); diff --git a/packages/app/src/components/inference/ui/ParetoUpliftPanel.tsx b/packages/app/src/components/inference/ui/ParetoUpliftPanel.tsx new file mode 100644 index 00000000..289f5f2e --- /dev/null +++ b/packages/app/src/components/inference/ui/ParetoUpliftPanel.tsx @@ -0,0 +1,418 @@ +'use client'; + +import { Info } from 'lucide-react'; +import { useMemo } from 'react'; + +import type { + ChartDefinition, + HardwareConfig, + InferenceData, + OverlayData, +} from '@/components/inference/types'; +import { + TooltipContent, + TooltipProvider, + TooltipRoot, + TooltipTrigger, +} from '@/components/ui/tooltip'; +import { + computeMeanUplift, + computeUplift, + formatUpliftPercent, + type MeanUpliftResult, + type RooflineDirection, + type UpliftResult, +} from '@/lib/pareto-uplift'; +import { getModelSortIndex } from '@/lib/constants'; +import { 
getDisplayLabel } from '@/lib/utils'; + +interface ParetoUpliftPanelProps { + /** Chart data (already filtered by useChartData — includes main + comparison-date points). */ + data: InferenceData[]; + chartDefinition: ChartDefinition; + selectedYAxisMetric: string; + hardwareConfig: HardwareConfig; + activeHwTypes: Set; + activeDates: Set; + selectedPrecisions: string[]; + /** Main (official) run date. Used as the reference when no unofficial overlay is present. */ + selectedRunDate: string; + /** Discrete comparison dates selected by the user. */ + selectedDates: string[]; + /** Range endpoints, used when the user picks a date range instead of individual dates. */ + selectedDateRange: { startDate: string; endDate: string }; + isTimelineMode: boolean; + /** When an unofficial PR run is active, these points become the reference instead of the main date. */ + overlayData?: OverlayData; + chartType: string; +} + +interface ColumnSpec { + id: string; + label: string; + hint?: string; + date: string; +} + +/** Scalar time / interactivity stats added beneath each GPU's primary Pareto row. */ +interface ScalarMetric { + id: string; + label: string; + field: keyof InferenceData; + higherIsBetter: boolean; + /** Unit shown in the tooltip. 
*/ + unit: string; +} + +const SCALAR_METRICS: ScalarMetric[] = [ + { + id: 'median_ttft', + label: 'Median TTFT', + field: 'median_ttft', + higherIsBetter: false, + unit: 's', + }, + { id: 'p99_ttft', label: 'P99 TTFT', field: 'p99_ttft', higherIsBetter: false, unit: 's' }, + { + id: 'median_tpot', + label: 'Median TPOT', + field: 'median_tpot', + higherIsBetter: false, + unit: 's', + }, + { + id: 'median_e2el', + label: 'Median E2EL', + field: 'median_e2el', + higherIsBetter: false, + unit: 's', + }, + { + id: 'median_intvty', + label: 'Interactivity', + field: 'median_intvty', + higherIsBetter: true, + unit: 'tok/s/user', + }, +]; + +type CellValue = + | { kind: 'pareto'; uplift: UpliftResult } + | { kind: 'mean'; uplift: MeanUpliftResult; metric: ScalarMetric } + | null; + +interface Row { + rowKey: string; + /** GPU display label shown only on the first row of each group. */ + gpuLabel: string; + /** Metric label shown in the "Metric" column. */ + metricLabel: string; + /** True for the first row of each GPU group (used for a subtle top border). */ + isFirstInGroup: boolean; + cells: { columnId: string; value: CellValue }[]; +} + +/** + * Historical uplift table: rows = (GPU × metric), columns = comparison dates. + * + * Per GPU: one "primary" row for the chart's Pareto-curve uplift plus one scalar row per time + * stat (Median/P99 TTFT, Median TPOT, Median E2EL, Median Interactivity). Scalar cells compare + * the arithmetic mean of each metric between the reference and the historical date, normalized + * so >1 always reads "reference is better". + * + * Reference = unofficial PR overlay when present, else the main run date. Built for the PR + * review workflow — "does my branch regress H100 Dynamo-TRT vs last Friday's main?" 
+ */ +export default function ParetoUpliftPanel({ + data, + chartDefinition, + selectedYAxisMetric, + hardwareConfig, + activeHwTypes, + activeDates, + selectedPrecisions, + selectedRunDate, + selectedDates, + selectedDateRange, + isTimelineMode, + overlayData, + chartType, +}: ParetoUpliftPanelProps) { + const rooflineDir = chartDefinition[ + `${selectedYAxisMetric}_roofline` as keyof ChartDefinition + ] as RooflineDirection | undefined; + + const primaryLabel = useMemo(() => { + const titleKey = `${selectedYAxisMetric}_title` as keyof ChartDefinition; + const title = chartDefinition[titleKey]; + return typeof title === 'string' && title.length > 0 ? title : 'Primary'; + }, [chartDefinition, selectedYAxisMetric]); + + const hasUnofficial = Boolean(overlayData && overlayData.data.length > 0); + + const visibleData = useMemo( + () => + data.filter((d) => { + if (isTimelineMode) return activeDates.has(`${d.date}_${d.hwKey}`); + return activeHwTypes.has(d.hwKey) && selectedPrecisions.includes(d.precision ?? 
''); + }), + [data, isTimelineMode, activeDates, activeHwTypes, selectedPrecisions], + ); + + const columns = useMemo(() => { + const seen = new Set(); + const cols: ColumnSpec[] = []; + const push = (date: string, hint?: string) => { + if (!date || seen.has(date)) return; + seen.add(date); + cols.push({ id: date, date, label: date, hint }); + }; + if (hasUnofficial && selectedRunDate) push(selectedRunDate, 'main'); + if (selectedDateRange.startDate) push(selectedDateRange.startDate, 'range start'); + if (selectedDateRange.endDate) push(selectedDateRange.endDate, 'range end'); + for (const d of selectedDates) push(d); + return cols.toSorted((a, b) => b.date.localeCompare(a.date)); + }, [hasUnofficial, selectedRunDate, selectedDates, selectedDateRange]); + + const { referenceByHw, historyByKey } = useMemo(() => { + const refByHw = new Map(); + const histByKey = new Map(); + + if (hasUnofficial && overlayData) { + for (const p of overlayData.data) { + if (!activeHwTypes.has(p.hwKey)) continue; + const arr = refByHw.get(p.hwKey); + if (arr) arr.push(p); + else refByHw.set(p.hwKey, [p]); + } + } + + for (const p of visibleData) { + if (!hasUnofficial && p.date === selectedRunDate) { + const arr = refByHw.get(p.hwKey); + if (arr) arr.push(p); + else refByHw.set(p.hwKey, [p]); + } + const isHistorical = hasUnofficial ? 
true : p.date !== selectedRunDate; + if (isHistorical) { + const key = `${p.hwKey}|${p.date}`; + const arr = histByKey.get(key); + if (arr) arr.push(p); + else histByKey.set(key, [p]); + } + } + return { referenceByHw: refByHw, historyByKey: histByKey }; + }, [visibleData, overlayData, hasUnofficial, activeHwTypes, selectedRunDate]); + + const rows = useMemo(() => { + if (columns.length === 0 || referenceByHw.size === 0) return []; + + const sortedHwKeys = [...referenceByHw.keys()].toSorted( + (a, b) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b), + ); + + const result: Row[] = []; + for (const hwKey of sortedHwKeys) { + const refPoints = referenceByHw.get(hwKey)!; + const cfg = hardwareConfig[hwKey]; + const gpuLabel = cfg ? getDisplayLabel({ label: cfg.label, suffix: cfg.suffix }) : hwKey; + + const groupRows: Row[] = []; + + // Primary Pareto-uplift row (only when the chart has a roofline direction AND enough ref pts). + if (rooflineDir && refPoints.length >= 2) { + const cells = columns.map((col) => { + const histPoints = historyByKey.get(`${hwKey}|${col.date}`); + if (!histPoints || histPoints.length < 2) { + return { columnId: col.id, value: null }; + } + const uplift = computeUplift(histPoints, refPoints, rooflineDir); + return { + columnId: col.id, + value: + Number.isFinite(uplift.geomean) && uplift.samples.length > 0 + ? ({ kind: 'pareto', uplift } as const) + : null, + }; + }); + if (cells.some((c) => c.value !== null)) { + groupRows.push({ + rowKey: `${hwKey}|__primary`, + gpuLabel, + metricLabel: primaryLabel, + isFirstInGroup: true, + cells, + }); + } + } + + // Scalar rows: arithmetic-mean ratio per metric. 
+ for (const metric of SCALAR_METRICS) { + const cells = columns.map((col) => { + const histPoints = historyByKey.get(`${hwKey}|${col.date}`); + if (!histPoints || histPoints.length === 0) { + return { columnId: col.id, value: null }; + } + const uplift = computeMeanUplift( + histPoints, + refPoints, + metric.field, + metric.higherIsBetter, + ); + return { + columnId: col.id, + value: Number.isFinite(uplift.ratio) + ? ({ kind: 'mean', uplift, metric } as const) + : null, + }; + }); + if (cells.some((c) => c.value !== null)) { + groupRows.push({ + rowKey: `${hwKey}|${metric.id}`, + gpuLabel, + metricLabel: metric.label, + isFirstInGroup: groupRows.length === 0, + cells, + }); + } + } + + if (groupRows.length > 0) result.push(...groupRows); + } + return result; + }, [referenceByHw, historyByKey, columns, rooflineDir, hardwareConfig, primaryLabel]); + + if (!hasUnofficial && columns.length === 0) return null; + if (rows.length === 0) return null; + + const referenceLabel = + hasUnofficial && overlayData + ? `Reference: PR · ${overlayData.label}` + : `Reference: ${selectedRunDate || 'current'}`; + + return ( +
+
+
+ Performance uplift + + + + + + +

+ How the reference set (PR overlay or current run) compares to each historical + date, per GPU+framework. The first row per GPU is the Pareto-curve uplift for the + chart's y-metric (geomean of per-SLA ratios across the x-overlap). The following + rows compare the arithmetic mean of each time stat. All cells are + direction-normalized so >1 = reference is better. +

+
+
+
+
+ {referenceLabel} +
+
+ + + + + + {columns.map((col) => ( + + ))} + + + + {rows.map((row) => ( + + + + {row.cells.map((cell) => ( + + ))} + + ))} + +
+ GPU + Metric + vs {col.label} + {col.hint && ({col.hint})} +
+ {row.isFirstInGroup ? row.gpuLabel : ''} + + {row.metricLabel} +
+
+
+ ); +} + +function UpliftCell({ value }: { value: CellValue }) { + if (!value) { + return —; + } + const ratio = value.kind === 'pareto' ? value.uplift.geomean : value.uplift.ratio; + const pct = formatUpliftPercent(ratio); + const isBetter = ratio > 1.0005; + const isWorse = ratio < 0.9995; + const color = isBetter ? 'text-emerald-500' : isWorse ? 'text-red-500' : 'text-muted-foreground'; + + const tooltip = + value.kind === 'pareto' ? ( +

+ Pareto geomean across {value.uplift.samples.length} SLA samples, covering{' '} + {Math.round(value.uplift.coverage * 100)}% of the union x-range. + {value.uplift.coverage < 0.5 && ' ⚠ Narrow overlap — inspect the curves.'} +

+ ) : ( +

+ Mean reference: {formatValue(value.uplift.meanCandidate, value.metric.unit)} · Mean + historical: {formatValue(value.uplift.meanBaseline, value.metric.unit)} +
+ Based on {value.uplift.countCandidate} ref / {value.uplift.countBaseline} historical points. +

+ ); + + return ( + + + + + {pct} + + {tooltip} + + + + ); +} + +function formatValue(v: number, unit: string): string { + if (!Number.isFinite(v)) return '—'; + if (unit === 's') { + if (v < 0.001) return `${(v * 1_000_000).toFixed(0)} µs`; + if (v < 1) return `${(v * 1000).toFixed(1)} ms`; + return `${v.toFixed(2)} s`; + } + return `${v.toFixed(1)} ${unit}`; +} diff --git a/packages/app/src/lib/pareto-uplift.test.ts b/packages/app/src/lib/pareto-uplift.test.ts new file mode 100644 index 00000000..cf01fca2 --- /dev/null +++ b/packages/app/src/lib/pareto-uplift.test.ts @@ -0,0 +1,262 @@ +import { describe, expect, it } from 'vitest'; + +import type { InferenceData } from '@/components/inference/types'; + +import { + computeMeanUplift, + computeUplift, + formatUpliftPercent, + interpolateY, + sampleSLAs, + yHigherIsBetter, +} from './pareto-uplift'; + +function pt(x: number, y: number, hwKey = 'h100', date = '2025-01-01'): InferenceData { + return { + date, + x, + y, + tp: 1, + conc: 1, + hwKey, + precision: 'fp8', + tpPerGpu: { y, roof: false }, + tpPerMw: { y, roof: false }, + costh: { y, roof: false }, + costn: { y, roof: false }, + costr: { y, roof: false }, + costhi: { y, roof: false }, + costni: { y, roof: false }, + costri: { y, roof: false }, + }; +} + +describe('interpolateY', () => { + it('returns null for empty curve', () => { + expect(interpolateY([], 1)).toBeNull(); + }); + + it('returns null when x is out of range', () => { + const curve = [ + { x: 1, y: 10 }, + { x: 2, y: 20 }, + ]; + expect(interpolateY(curve, 0.5)).toBeNull(); + expect(interpolateY(curve, 2.5)).toBeNull(); + }); + + it('returns the exact y at endpoints', () => { + const curve = [ + { x: 1, y: 10 }, + { x: 2, y: 20 }, + ]; + expect(interpolateY(curve, 1)).toBe(10); + expect(interpolateY(curve, 2)).toBe(20); + }); + + it('linearly interpolates between two points', () => { + const curve = [ + { x: 0, y: 0 }, + { x: 10, y: 100 }, + ]; + expect(interpolateY(curve, 5)).toBe(50); + 
expect(interpolateY(curve, 2.5)).toBe(25); + }); + + it('interpolates across multi-segment curves', () => { + const curve = [ + { x: 0, y: 0 }, + { x: 10, y: 100 }, + { x: 20, y: 110 }, + ]; + expect(interpolateY(curve, 5)).toBe(50); + expect(interpolateY(curve, 15)).toBe(105); + }); +}); + +describe('sampleSLAs', () => { + it('returns 5 log-spaced points over a 100x range', () => { + const slas = sampleSLAs(1, 100, 5); + expect(slas).toHaveLength(5); + expect(slas[0]).toBeCloseTo(1); + expect(slas[4]).toBeCloseTo(100); + expect(slas[2]).toBeCloseTo(10); + }); + + it('returns linear-spaced points when the range is <10x', () => { + const slas = sampleSLAs(10, 50, 5); + expect(slas).toHaveLength(5); + expect(slas[0]).toBeCloseTo(10); + expect(slas[2]).toBeCloseTo(30); + expect(slas[4]).toBeCloseTo(50); + }); + + it('returns [] for invalid ranges', () => { + expect(sampleSLAs(0, 10, 5)).toEqual([]); + expect(sampleSLAs(10, 10, 5)).toEqual([]); + expect(sampleSLAs(-1, 10, 5)).toEqual([]); + }); + + it('returns geometric midpoint for n=1', () => { + expect(sampleSLAs(1, 100, 1)).toEqual([10]); + }); +}); + +describe('yHigherIsBetter', () => { + it('is true for upper_* directions', () => { + expect(yHigherIsBetter('upper_left')).toBe(true); + expect(yHigherIsBetter('upper_right')).toBe(true); + }); + + it('is false for lower_* directions', () => { + expect(yHigherIsBetter('lower_left')).toBe(false); + expect(yHigherIsBetter('lower_right')).toBe(false); + }); +}); + +describe('computeUplift', () => { + it('returns 1.0 geomean for identical curves', () => { + const a = [pt(10, 100), pt(20, 200), pt(30, 250)]; + const b = [pt(10, 100), pt(20, 200), pt(30, 250)]; + const r = computeUplift(a, b, 'upper_right'); + expect(r.geomean).toBeCloseTo(1, 5); + expect(r.coverage).toBe(1); + expect(r.samples.length).toBeGreaterThan(0); + }); + + it('returns ~2.0 for a candidate uniformly 2x baseline throughput (upper_right)', () => { + const baseline = [pt(10, 100), pt(20, 150), pt(30, 
180)]; + const candidate = [pt(10, 200), pt(20, 300), pt(30, 360)]; + const r = computeUplift(baseline, candidate, 'upper_right'); + expect(r.geomean).toBeCloseTo(2, 2); + }); + + it('inverts ratio for lower-is-better metrics (cost)', () => { + // lower_left front shape: y decreases as x increases. Candidate has half the y everywhere + // → ratio (yBaseline / yCandidate) = 2, meaning candidate is 2x "better". + const baseline = [pt(1, 4), pt(2, 2), pt(3, 1)]; + const candidate = [pt(1, 2), pt(2, 1), pt(3, 0.5)]; + const r = computeUplift(baseline, candidate, 'lower_left'); + expect(r.geomean).toBeCloseTo(2, 2); + }); + + it('returns NaN geomean and coverage=0 for disjoint x-ranges', () => { + const a = [pt(1, 10), pt(2, 20)]; + const b = [pt(10, 100), pt(20, 200)]; + const r = computeUplift(a, b, 'upper_right'); + expect(Number.isNaN(r.geomean)).toBe(true); + expect(r.coverage).toBe(0); + expect(r.samples).toHaveLength(0); + }); + + it('reports partial coverage for partially-overlapping curves', () => { + // baseline spans [1, 10]; candidate spans [5, 20]. Overlap = [5, 10], union = [1, 20]. 
+ const baseline = [pt(1, 10), pt(5, 50), pt(10, 100)]; + const candidate = [pt(5, 50), pt(10, 100), pt(20, 150)]; + const r = computeUplift(baseline, candidate, 'upper_right'); + expect(r.coverage).toBeCloseTo(5 / 19, 3); + expect(r.geomean).toBeCloseTo(1, 2); + expect(r.overlapRange).toEqual({ min: 5, max: 10 }); + }); + + it('handles empty inputs', () => { + const r = computeUplift([], [pt(1, 1)], 'upper_right'); + expect(r.baselineFrontSize).toBe(0); + expect(Number.isNaN(r.geomean)).toBe(true); + }); + + it('does not mutate input arrays', () => { + const a = [pt(30, 300), pt(10, 100), pt(20, 200)]; + const aSnapshot = a.map((p) => ({ x: p.x, y: p.y })); + const b = [pt(10, 120), pt(20, 220), pt(30, 320)]; + computeUplift(a, b, 'upper_right'); + expect(a.map((p) => ({ x: p.x, y: p.y }))).toEqual(aSnapshot); + }); + + it('filters non-dominated points before sampling (picks Pareto front only)', () => { + // Baseline has a dominated point at (15, 50) that should not affect the curve. + const baseline = [pt(10, 100), pt(15, 50), pt(20, 150), pt(30, 180)]; + const candidate = [pt(10, 200), pt(20, 300), pt(30, 360)]; + const r = computeUplift(baseline, candidate, 'upper_right'); + expect(r.geomean).toBeCloseTo(2, 2); + expect(r.baselineFrontSize).toBeLessThan(baseline.length); + }); +}); + +describe('computeMeanUplift', () => { + // Build a point with a specific ttft value; other fields use defaults. 
+ const pointWithTtft = (ttft: number): InferenceData => ({ + ...pt(1, 1), + median_ttft: ttft, + }); + + it('returns 1.0 for identical averages', () => { + const baseline = [pointWithTtft(0.5), pointWithTtft(1)]; + const candidate = [pointWithTtft(0.5), pointWithTtft(1)]; + const r = computeMeanUplift(baseline, candidate, 'median_ttft', false); + expect(r.ratio).toBe(1); + expect(r.meanBaseline).toBe(0.75); + expect(r.meanCandidate).toBe(0.75); + }); + + it('inverts ratio for lower-is-better fields (TTFT)', () => { + const baseline = [pointWithTtft(1), pointWithTtft(1)]; // mean 1.0 + const candidate = [pointWithTtft(0.5), pointWithTtft(0.5)]; // mean 0.5 (better) + const r = computeMeanUplift(baseline, candidate, 'median_ttft', false); + expect(r.ratio).toBe(2); // candidate (ref) is 2x better + }); + + it('keeps ratio unflipped for higher-is-better fields (interactivity)', () => { + const baseline = [ + { ...pt(1, 1), median_intvty: 10 }, + { ...pt(1, 1), median_intvty: 20 }, + ]; + const candidate = [ + { ...pt(1, 1), median_intvty: 30 }, + { ...pt(1, 1), median_intvty: 30 }, + ]; + const r = computeMeanUplift(baseline, candidate, 'median_intvty', true); + expect(r.ratio).toBe(2); + }); + + it('skips non-finite / non-positive values when computing the mean', () => { + const baseline = [ + pointWithTtft(1), + pointWithTtft(Number.NaN), + pointWithTtft(-5), + pointWithTtft(3), + ]; + const candidate = [pointWithTtft(1), pointWithTtft(3)]; + const r = computeMeanUplift(baseline, candidate, 'median_ttft', false); + expect(r.countBaseline).toBe(2); + expect(r.meanBaseline).toBe(2); + expect(r.ratio).toBe(1); + }); + + it('returns NaN ratio when either side has no usable values', () => { + const baseline = [pointWithTtft(1)]; + const candidate = [pointWithTtft(Number.NaN)]; + const r = computeMeanUplift(baseline, candidate, 'median_ttft', false); + expect(Number.isNaN(r.ratio)).toBe(true); + }); +}); + +describe('formatUpliftPercent', () => { + it('formats positive 
uplift', () => { + expect(formatUpliftPercent(1.173)).toBe('+17.3%'); + }); + + it('formats negative uplift with a minus sign', () => { + expect(formatUpliftPercent(0.83)).toBe('−17.0%'); + }); + + it('returns "parity" for ratios near 1', () => { + expect(formatUpliftPercent(1)).toBe('parity'); + expect(formatUpliftPercent(0.9999)).toBe('parity'); + }); + + it('returns em dash for non-finite values', () => { + expect(formatUpliftPercent(NaN)).toBe('—'); + expect(formatUpliftPercent(Infinity)).toBe('—'); + }); +}); diff --git a/packages/app/src/lib/pareto-uplift.ts b/packages/app/src/lib/pareto-uplift.ts new file mode 100644 index 00000000..f9e1da47 --- /dev/null +++ b/packages/app/src/lib/pareto-uplift.ts @@ -0,0 +1,235 @@ +import type { InferenceData } from '@/components/inference/types'; + +import { + paretoFrontLowerLeft, + paretoFrontLowerRight, + paretoFrontUpperLeft, + paretoFrontUpperRight, +} from './chart-utils'; + +export type RooflineDirection = 'upper_left' | 'upper_right' | 'lower_left' | 'lower_right'; + +export interface ParetoPoint { + x: number; + y: number; +} + +export interface UpliftSample { + x: number; + yBaseline: number; + yCandidate: number; + ratio: number; +} + +export interface UpliftResult { + /** + * Geometric mean of per-SLA ratios, direction-normalized so >1 always means "candidate + * better" and <1 always means "baseline better", regardless of whether y is higher-is-better + * (throughput) or lower-is-better (cost, energy). + */ + geomean: number; + samples: UpliftSample[]; + /** x-range where both Pareto fronts overlap and SLAs are sampled. */ + overlapRange: { min: number; max: number } | null; + /** overlap / union of the two x-ranges. 0 = disjoint, 1 = identical bounds. 
*/ + coverage: number; + baselineFrontSize: number; + candidateFrontSize: number; +} + +const FRONT_FNS: Record InferenceData[]> = { + upper_left: paretoFrontUpperLeft, + upper_right: paretoFrontUpperRight, + lower_left: paretoFrontLowerLeft, + lower_right: paretoFrontLowerRight, +}; + +export function yHigherIsBetter(dir: RooflineDirection): boolean { + return dir === 'upper_left' || dir === 'upper_right'; +} + +/** Linear interpolation of y at x along a curve sorted ascending by x. Returns null out of range. */ +export function interpolateY(curve: ParetoPoint[], x: number): number | null { + if (curve.length === 0) return null; + if (curve.length === 1) return curve[0].x === x ? curve[0].y : null; + const first = curve[0]; + const last = curve.at(-1)!; + if (x < first.x || x > last.x) return null; + for (let i = 0; i < curve.length - 1; i++) { + const a = curve[i]; + const b = curve[i + 1]; + if (x >= a.x && x <= b.x) { + if (b.x === a.x) return (a.y + b.y) / 2; + const t = (x - a.x) / (b.x - a.x); + return a.y + t * (b.y - a.y); + } + } + return null; +} + +/** + * Sample n points across [min, max]. Uses log-spacing when the range spans >10× + * (so SLAs distribute evenly across orders of magnitude). + */ +export function sampleSLAs(min: number, max: number, n: number): number[] { + if (!(min > 0) || !(max > 0) || min >= max || n < 1) return []; + if (n === 1) return [Math.sqrt(min * max)]; + const useLog = Math.log10(max / min) > 1; + const pts: number[] = []; + for (let i = 0; i < n; i++) { + const t = i / (n - 1); + pts.push(useLog ? min * (max / min) ** t : min + t * (max - min)); + } + return pts; +} + +/** Compute Pareto front from raw points and return sorted ascending by x. */ +function computeFront(points: InferenceData[], direction: RooflineDirection): ParetoPoint[] { + if (points.length === 0) return []; + // paretoFront* mutates via .sort; clone to avoid side effects on the caller's array. 
/**
 * Pick the Pareto-front routine (imported from `./chart-utils`) that matches `direction`.
 * The `never` assignment in the default branch turns a newly added direction into a
 * compile error here instead of a silent `undefined` lookup at runtime.
 */
function paretoFrontFor(
  direction: RooflineDirection,
): (points: InferenceData[]) => InferenceData[] {
  switch (direction) {
    case 'upper_left':
      return paretoFrontUpperLeft;
    case 'upper_right':
      return paretoFrontUpperRight;
    case 'lower_left':
      return paretoFrontLowerLeft;
    case 'lower_right':
      return paretoFrontLowerRight;
    default: {
      const unexpected: never = direction;
      throw new Error(`Unknown roofline direction: ${String(unexpected)}`);
    }
  }
}

/**
 * Compute the Pareto front of raw points and return it as bare (x, y) pairs sorted
 * ascending by x.
 *
 * @param points - Raw data points; the array itself is never mutated.
 * @param direction - Which Pareto-front variant to build.
 * @returns The front, or `[]` when no point has finite coordinates.
 */
function computeFront(points: InferenceData[], direction: RooflineDirection): ParetoPoint[] {
  const hasFiniteCoords = (p: { x: number; y: number }): boolean =>
    Number.isFinite(p.x) && Number.isFinite(p.y);

  // Discard unusable points BEFORE building the front. Filtering only afterwards lets a
  // NaN coordinate reach the front routine's sort, and lets an infinite y dominate real
  // points and then vanish, leaving a truncated front.
  // `filter` also yields a fresh array: paretoFront* reportedly sorts its argument in
  // place, and that must not reorder the caller's array.
  const usable = points.filter(hasFiniteCoords);
  if (usable.length === 0) return [];

  return paretoFrontFor(direction)(usable)
    .map((p) => ({ x: p.x, y: p.y }))
    // Kept from the original as a cheap guard on whatever the front routine returns.
    .filter(hasFiniteCoords)
    .toSorted((a, b) => a.x - b.x);
}

/**
 * Compare two Pareto curves and return a single-number geomean uplift ratio.
 *
 * Algorithm: compute Pareto fronts for each set, find x-overlap, sample N SLAs across the
 * overlap, linearly interpolate y on each front, and take the geometric mean of per-SLA
 * (candidate/baseline) ratios. Ratio is inverted for lower-is-better metrics so the result
 * always reads "candidate performance relative to baseline" (>1 = better, <1 = worse).
 *
 * @param baselinePts - Raw points of the baseline run.
 * @param candidatePts - Raw points of the candidate run.
 * @param direction - Pareto direction of the metric; also decides whether higher y is better.
 * @param slaCount - How many x positions to sample across the overlap (default 5).
 * @returns `geomean` is NaN when nothing could be compared:
 *   - either front is empty (`coverage` 0, `overlapRange` null);
 *   - the x-ranges share no interval of positive width (`overlapRange` null, `coverage`
 *     still reported);
 *   - no sample produced a positive y on both fronts (`overlapRange` set). This includes
 *     the case where `sampleSLAs` returns nothing because the overlap starts at x <= 0.
 *   `coverage` is computed on a linear x scale and is 0 when the union has zero width.
 */
export function computeUplift(
  baselinePts: InferenceData[],
  candidatePts: InferenceData[],
  direction: RooflineDirection,
  slaCount = 5,
): UpliftResult {
  const baseline = computeFront(baselinePts, direction);
  const candidate = computeFront(candidatePts, direction);

  const empty: UpliftResult = {
    geomean: NaN,
    samples: [],
    overlapRange: null,
    coverage: 0,
    baselineFrontSize: baseline.length,
    candidateFrontSize: candidate.length,
  };

  if (baseline.length === 0 || candidate.length === 0) return empty;

  // Fronts are sorted ascending by x, so the first/last vertices bound each x-range.
  const bMin = baseline[0].x;
  const bMax = baseline.at(-1)!.x;
  const cMin = candidate[0].x;
  const cMax = candidate.at(-1)!.x;

  const overlapMin = Math.max(bMin, cMin);
  const overlapMax = Math.min(bMax, cMax);
  const unionMin = Math.min(bMin, cMin);
  const unionMax = Math.max(bMax, cMax);
  // Disjoint ranges give a negative overlap width; clamp so coverage stays within [0, 1].
  const coverage =
    unionMax > unionMin ? Math.max(0, (overlapMax - overlapMin) / (unionMax - unionMin)) : 0;

  if (overlapMin >= overlapMax) {
    return { ...empty, coverage };
  }

  const overlapRange = { min: overlapMin, max: overlapMax };
  const slas = sampleSLAs(overlapMin, overlapMax, slaCount);
  const higherIsBetter = yHigherIsBetter(direction);

  const samples: UpliftSample[] = [];
  let logSum = 0;
  for (const x of slas) {
    const yB = interpolateY(baseline, x);
    const yC = interpolateY(candidate, x);
    // A non-positive y would make the ratio zero, negative or infinite, none of which
    // has a usable logarithm for the geometric mean; skip the sample.
    if (yB === null || yC === null || yB <= 0 || yC <= 0) continue;
    const ratio = higherIsBetter ? yC / yB : yB / yC;
    samples.push({ x, yBaseline: yB, yCandidate: yC, ratio });
    logSum += Math.log(ratio);
  }

  if (samples.length === 0) {
    return { ...empty, overlapRange, coverage };
  }

  return {
    // Geometric mean computed as exp(mean(log ratio)).
    geomean: Math.exp(logSum / samples.length),
    samples,
    overlapRange,
    coverage,
    baselineFrontSize: baseline.length,
    candidateFrontSize: candidate.length,
  };
}

/** Result of comparing the arithmetic mean of one scalar field between two point sets. */
export interface MeanUpliftResult {
  /** Direction-normalized ratio (>1 = candidate is better on this metric); NaN if not comparable. */
  ratio: number;
  /** Mean of the usable baseline values; NaN when there are none. */
  meanBaseline: number;
  /** Mean of the usable candidate values; NaN when there are none. */
  meanCandidate: number;
  /** Number of baseline values that went into `meanBaseline`. */
  countBaseline: number;
  /** Number of candidate values that went into `meanCandidate`. */
  countCandidate: number;
}
/**
 * Compare the arithmetic mean of a scalar field between two point sets. The returned ratio
 * is direction-normalized so >1 always reads "candidate outperformed baseline".
 *
 * Used for the time-stat rows in the uplift table — a simpler "how did the average move?"
 * signal that does not need a Pareto curve or SLA sampling.
 *
 * @param baselinePts - Points of the baseline run.
 * @param candidatePts - Points of the candidate run.
 * @param field - Field to average. Only values that are finite numbers > 0 are counted;
 *   everything else (missing, non-numeric, zero, negative, NaN, ±Infinity) is ignored.
 * @param higherIsBetter - Whether a larger mean is the better outcome for this field.
 * @returns Means and contributing counts for both sets. `ratio` is NaN when either set
 *   has no usable value or either mean is not a finite positive number.
 */
export function computeMeanUplift(
  baselinePts: InferenceData[],
  candidatePts: InferenceData[],
  field: keyof InferenceData,
  higherIsBetter: boolean,
): MeanUpliftResult {
  const summarize = (pts: InferenceData[]): { mean: number; count: number } => {
    let sum = 0;
    let count = 0;
    for (const p of pts) {
      const v = p[field];
      if (typeof v === 'number' && Number.isFinite(v) && v > 0) {
        sum += v;
        count++;
      }
    }
    return { count, mean: count > 0 ? sum / count : NaN };
  };

  const b = summarize(baselinePts);
  const c = summarize(candidatePts);

  // The ratio is only meaningful when both means are finite and strictly positive;
  // otherwise report NaN but still expose the means and counts.
  const comparable =
    Number.isFinite(b.mean) && Number.isFinite(c.mean) && b.mean > 0 && c.mean > 0;
  let ratio = NaN;
  if (comparable) {
    ratio = higherIsBetter ? c.mean / b.mean : b.mean / c.mean;
  }

  return {
    ratio,
    meanBaseline: b.mean,
    meanCandidate: c.mean,
    countBaseline: b.count,
    countCandidate: c.count,
  };
}

/**
 * Format a direction-normalized ratio as a signed percentage change.
 *
 * @param ratio - Ratio where 1 means "no change".
 * @returns "+17.3%" / "−4.1%" style text with one decimal and a typographic minus sign,
 *   "parity" when the change is smaller than 0.05 percentage points (it would render as
 *   0.0%), or "—" when the ratio is not a finite number.
 */
export function formatUpliftPercent(ratio: number): string {
  if (!Number.isFinite(ratio)) return '—';
  const pct = (ratio - 1) * 100;
  if (Math.abs(pct) < 0.05) return 'parity';
  const sign = pct > 0 ? '+' : '−';
  return `${sign}${Math.abs(pct).toFixed(1)}%`;
}