From b3f3e98128a21306a44722ea63d104189e66f1ab Mon Sep 17 00:00:00 2001 From: Andy Date: Thu, 11 Jun 2026 16:27:33 +0900 Subject: [PATCH] fix: stop double-counting partial offload in ranking --- docs/how-it-works.md | 7 +- docs/scoring.md | 19 +++--- src/whichllm/engine/ranker.py | 50 +++++++++----- tests/test_ranker.py | 125 +++++++++++++++++++++++++++++++++- 4 files changed, 168 insertions(+), 33 deletions(-) diff --git a/docs/how-it-works.md b/docs/how-it-works.md index b1baaab..d608ccb 100644 --- a/docs/how-it-works.md +++ b/docs/how-it-works.md @@ -168,9 +168,10 @@ For each candidate variant: 5. Compute a quality score. 6. Keep the best variant for the model family. -The final sorting key includes the quality score, a fit bonus, and a small -direct-benchmark bonus. Full-GPU candidates are preferred over comparable -partial-offload candidates because they are usually more responsive in practice. +The final sorting key stays close to the displayed quality score, with a small +direct-benchmark bonus and a CPU-only penalty. Full-GPU candidates are already +favored inside the score through the runtime-fit and speed adjustments, so the +sort key does not add a second full-GPU bonus. See [Scoring](scoring.md) for the score details. diff --git a/docs/scoring.md b/docs/scoring.md index 00bd5a6..91bcd68 100644 --- a/docs/scoring.md +++ b/docs/scoring.md @@ -99,20 +99,17 @@ The candidate's runtime form matters: | Fit | Multiplier | | --- | ---: | | Full GPU | `1.00` | -| Partial offload | `0.72` | +| Partial offload | `0.42`-`0.88`, based on spill ratio | | CPU-only | `0.50` | -The final family selection key also adds a fit bonus: +Light partial offload is penalized less than heavy offload. MoE models receive +a milder penalty when the active parameter working set can plausibly stay on +GPU while inactive experts spill to CPU RAM. -| Fit | Bonus | -| --- | ---: | -| Full GPU | `+15` | -| Partial offload | `0` | -| CPU-only | `-15` | - -This keeps a responsive full-GPU result ahead of a similar partial-offload -result, without letting a very weak full-GPU model beat a much stronger model -that only needs modest offload. +The final family selection key does not add a separate full-GPU bonus. Runtime +fit is already reflected in the quality score through the multiplier above and +the speed adjustment below. CPU-only results receive a small extra sort penalty +when mixed with GPU-backed candidates. ## Speed adjustment diff --git a/src/whichllm/engine/ranker.py b/src/whichllm/engine/ranker.py index 2a03faf..2e3d8d8 100644 --- a/src/whichllm/engine/ranker.py +++ b/src/whichllm/engine/ranker.py @@ -41,28 +41,20 @@ def _family_selection_key( ) -> tuple[float]: """Family-level selection key — single composite score. - Combines quality, fit type, and evidence tier into one number so the - sort is fully transitive and edge cases resolve sensibly: - - - ``fit_bonus`` (+15 / 0 / -15) is large enough that "estimated, - full-GPU" still beats "direct, partial-offload" of comparable - quality (users on small VRAM prefer the responsive option), - but small enough that a quality-17 Q1_0 full-GPU model loses to - a quality-57 partial-offload 27B model + ``quality_score`` already includes the runtime fit penalty and speed + adjustment. Keep final selection close to that displayed score so strong + partial-offload candidates do not get discounted again while sorting. + - ``direct_bonus`` (+5) gives independent leaderboard evidence a small edge at the same fit; cannot overturn a 6+ point quality gap """ - fit_bonus = { - "full_gpu": 15.0, - "partial_offload": 0.0, - "cpu_only": -15.0, - }.get(result.fit_type, -15.0) if require_direct_top and result.benchmark_status == "direct": direct_bonus = 5.0 else: direct_bonus = 0.0 + cpu_penalty = -6.0 if result.fit_type == "cpu_only" else 0.0 ctx_penalty = -20.0 if not result.context_fits else 0.0 - return (result.quality_score + fit_bonus + direct_bonus + ctx_penalty,) + return (result.quality_score + direct_bonus + cpu_penalty + ctx_penalty,) def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> float: @@ -74,14 +66,36 @@ def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> f factor = 0.52 elif ratio >= 0.40: factor = 0.62 + elif ratio >= 0.25: + factor = 0.76 else: - factor = 0.72 + factor = 0.86 # MoE offload is more nuanced: inactive experts and router/runtime - # placement do not hurt equally. Keep the penalty, but do not treat it - # as badly as dense-layer offload. + # placement do not hurt equally. If the GPU can plausibly hold the + # active expert working set, do not treat inactive-expert spill like + # dense-layer spill. if model.is_moe and model.parameter_count_active: - factor = min(0.72, factor + 0.08) + active_ratio = ( + model.parameter_count_active / model.parameter_count + if model.parameter_count > 0 + else 1.0 + ) + active_ratio = max(0.0, min(1.0, active_ratio)) + active_set_fits = ratio <= max(0.0, 1.0 - active_ratio) + if active_set_fits: + if ratio >= 0.75: + factor = max(factor, 0.66) + elif ratio >= 0.60: + factor = max(factor, 0.70) + elif ratio >= 0.40: + factor = max(factor, 0.76) + elif ratio >= 0.25: + factor = max(factor, 0.82) + else: + factor = max(factor, 0.88) + else: + factor = min(0.76, factor + 0.08) return factor diff --git a/tests/test_ranker.py b/tests/test_ranker.py index 7d639f9..3b374ee 100644 --- a/tests/test_ranker.py +++ b/tests/test_ranker.py @@ -1,7 +1,7 @@ """Tests for ranking behavior.""" from whichllm.engine.quantization import effective_quant_type -from whichllm.engine.ranker import rank_models +from whichllm.engine.ranker import _partial_offload_quality_factor, rank_models from whichllm.hardware.types import GPUInfo, HardwareInfo from whichllm.models.types import GGUFVariant, ModelInfo @@ -493,6 +493,129 @@ def test_full_gpu_estimated_ranks_above_partial_direct(): assert results[0].model.id == "Qwen/Qwen3-8B-AWQ" +def test_strong_partial_offload_not_buried_below_weaker_full_gpu(): + strong_partial = ModelInfo( + id="Qwen/Qwen3.6-27B", + family_id="qwen3.6-27b", + name="Qwen3.6-27B", + parameter_count=27_800_000_000, + downloads=5_300_000, + likes=10_000, + gguf_variants=[ + GGUFVariant( + filename="qwen3.6-27b-q4_k_m.gguf", + quant_type="Q4_K_M", + file_size_bytes=15 * 1024**3, + ) + ], + ) + full_gpu_14b = ModelInfo( + id="Qwen/Qwen3-14B", + family_id="qwen3-14b", + name="Qwen3-14B", + parameter_count=14_800_000_000, + downloads=1_600_000, + likes=5_000, + gguf_variants=[ + GGUFVariant( + filename="qwen3-14b-q5_k_m.gguf", + quant_type="Q5_K_M", + file_size_bytes=9 * 1024**3, + ) + ], + ) + full_gpu_8b = ModelInfo( + id="Qwen/Qwen3-8B", + family_id="qwen3-8b", + name="Qwen3-8B", + parameter_count=8_200_000_000, + downloads=11_000_000, + likes=5_000, + gguf_variants=[ + GGUFVariant( + filename="qwen3-8b-q5_k_m.gguf", + quant_type="Q5_K_M", + file_size_bytes=5 * 1024**3, + ) + ], + ) + old_full_gpu = ModelInfo( + id="google/gemma-2-9b-it", + family_id="gemma-2-9b-it", + name="gemma-2-9b-it", + parameter_count=9_200_000_000, + downloads=400_000, + likes=1_000, + gguf_variants=[ + GGUFVariant( + filename="gemma-2-9b-q5_k_m.gguf", + quant_type="Q5_K_M", + file_size_bytes=5_500_000_000, + ) + ], + ) + hardware = HardwareInfo( + gpus=[ + GPUInfo( + name="RTX 3060", + vendor="nvidia", + vram_bytes=12 * 1024**3, + compute_capability=(8, 6), + memory_bandwidth_gbps=360.0, + ) + ], + cpu_name="Test CPU", + cpu_cores=6, + has_avx2=True, + ram_bytes=32 * 1024**3, + disk_free_bytes=500 * 1024**3, + os="windows", + ) + + results = rank_models( + [strong_partial, full_gpu_14b, full_gpu_8b, old_full_gpu], + hardware, + top_n=10, + benchmark_scores={ + "Qwen/Qwen3.6-27B": 83.5, + "Qwen/Qwen3-14B": 66.7, + "Qwen/Qwen3-8B": 56.1, + "google/gemma-2-9b-it": 35.1, + }, + task_profile="any", + ) + + ids = [r.model.id for r in results] + assert ids.index("Qwen/Qwen3.6-27B") < ids.index("Qwen/Qwen3-8B") + assert ids.index("Qwen/Qwen3.6-27B") < ids.index("google/gemma-2-9b-it") + strong = next(r for r in results if r.model.id == "Qwen/Qwen3.6-27B") + assert strong.fit_type == "partial_offload" + assert ( + strong.quality_score + > next(r for r in results if r.model.id == "Qwen/Qwen3-8B").quality_score + ) + + +def test_moe_partial_offload_penalty_uses_active_working_set(): + dense = ModelInfo( + id="example/Dense-30B", + family_id="dense-30b", + name="Dense-30B", + parameter_count=30_000_000_000, + ) + moe = ModelInfo( + id="example/MoE-30B-A3B", + family_id="moe-30b-a3b", + name="MoE-30B-A3B", + parameter_count=30_000_000_000, + parameter_count_active=3_000_000_000, + is_moe=True, + ) + + assert _partial_offload_quality_factor(dense, 0.80) == 0.42 + assert _partial_offload_quality_factor(moe, 0.80) >= 0.66 + + def test_evidence_strict_filters_out_estimated_models(): direct_model = ModelInfo( id="Qwen/Qwen2.5-7B-Instruct",