Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions docs/how-it-works.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,10 @@ For each candidate variant:
5. Compute a quality score.
6. Keep the best variant for the model family.

The final sorting key includes the quality score, a fit bonus, and a small
direct-benchmark bonus. Full-GPU candidates are preferred over comparable
partial-offload candidates because they are usually more responsive in practice.
The final sorting key stays close to the displayed quality score, with a small
direct-benchmark bonus and a CPU-only penalty. Full-GPU candidates are already
favored inside the score through the runtime-fit and speed adjustments, so the
sort key does not add a second full-GPU bonus.

See [Scoring](scoring.md) for the score details.

Expand Down
19 changes: 8 additions & 11 deletions docs/scoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,20 +99,17 @@ The candidate's runtime form matters:
| Fit | Multiplier |
| --- | ---: |
| Full GPU | `1.00` |
| Partial offload | `0.72` |
| Partial offload | `0.42`-`0.88`, based on spill ratio |
| CPU-only | `0.50` |

The final family selection key also adds a fit bonus:
Light partial offload is penalized less than heavy offload. MoE models receive
a milder penalty when the active parameter working set can plausibly stay on
GPU while inactive experts spill to CPU RAM.

| Fit | Bonus |
| --- | ---: |
| Full GPU | `+15` |
| Partial offload | `0` |
| CPU-only | `-15` |

This keeps a responsive full-GPU result ahead of a similar partial-offload
result, without letting a very weak full-GPU model beat a much stronger model
that only needs modest offload.
The final family selection key does not add a separate full-GPU bonus. Runtime
fit is already reflected in the quality score through the multiplier above and
the speed adjustment below. CPU-only results additionally receive a small fixed
penalty (`-6`) in the selection key; it only changes the ordering when they
compete with GPU-backed candidates.

## Speed adjustment

Expand Down
50 changes: 32 additions & 18 deletions src/whichllm/engine/ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,28 +41,20 @@ def _family_selection_key(
) -> tuple[float]:
"""Family-level selection key — single composite score.

Combines quality, fit type, and evidence tier into one number so the
sort is fully transitive and edge cases resolve sensibly:

- ``fit_bonus`` (+15 / 0 / -15) is large enough that "estimated,
full-GPU" still beats "direct, partial-offload" of comparable
quality (users on small VRAM prefer the responsive option),
but small enough that a quality-17 Q1_0 full-GPU model loses to
a quality-57 partial-offload 27B model
``quality_score`` already includes the runtime fit penalty and speed
adjustment. Keep final selection close to that displayed score so strong
partial-offload candidates do not get discounted again while sorting.

- ``direct_bonus`` (+5) gives independent leaderboard evidence a
small edge at the same fit; cannot overturn a 6+ point quality gap
"""
fit_bonus = {
"full_gpu": 15.0,
"partial_offload": 0.0,
"cpu_only": -15.0,
}.get(result.fit_type, -15.0)
if require_direct_top and result.benchmark_status == "direct":
direct_bonus = 5.0
else:
direct_bonus = 0.0
cpu_penalty = -6.0 if result.fit_type == "cpu_only" else 0.0
ctx_penalty = -20.0 if not result.context_fits else 0.0
return (result.quality_score + fit_bonus + direct_bonus + ctx_penalty,)
return (result.quality_score + direct_bonus + cpu_penalty + ctx_penalty,)


def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> float:
Expand All @@ -74,14 +66,36 @@ def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> f
factor = 0.52
elif ratio >= 0.40:
factor = 0.62
elif ratio >= 0.25:
factor = 0.76
else:
factor = 0.72
factor = 0.86

# MoE offload is more nuanced: inactive experts and router/runtime
# placement do not hurt equally. Keep the penalty, but do not treat it
# as badly as dense-layer offload.
# placement do not hurt equally. If the GPU can plausibly hold the
# active expert working set, do not treat inactive-expert spill like
# dense-layer spill.
if model.is_moe and model.parameter_count_active:
factor = min(0.72, factor + 0.08)
active_ratio = (
model.parameter_count_active / model.parameter_count
if model.parameter_count > 0
else 1.0
)
active_ratio = max(0.0, min(1.0, active_ratio))
active_set_fits = ratio <= max(0.0, 1.0 - active_ratio)
if active_set_fits:
if ratio >= 0.75:
factor = max(factor, 0.66)
elif ratio >= 0.60:
factor = max(factor, 0.70)
elif ratio >= 0.40:
factor = max(factor, 0.76)
elif ratio >= 0.25:
factor = max(factor, 0.82)
else:
factor = max(factor, 0.88)
else:
factor = min(0.76, factor + 0.08)

return factor

Expand Down
125 changes: 124 additions & 1 deletion tests/test_ranker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Tests for ranking behavior."""

from whichllm.engine.quantization import effective_quant_type
from whichllm.engine.ranker import rank_models
from whichllm.engine.ranker import _partial_offload_quality_factor, rank_models
from whichllm.hardware.types import GPUInfo, HardwareInfo
from whichllm.models.types import GGUFVariant, ModelInfo

Expand Down Expand Up @@ -493,6 +493,129 @@ def test_full_gpu_estimated_ranks_above_partial_direct():
assert results[0].model.id == "Qwen/Qwen3-8B-AWQ"


def test_strong_partial_offload_not_buried_below_weaker_full_gpu():
    """A high-scoring partial-offload model must outrank weaker smaller models.

    The fixture pairs a 12 GiB GPU with a 27B model whose GGUF file is 15 GiB,
    next to three smaller models with lower benchmark scores. The 27B model is
    expected to be reported as ``partial_offload`` and still rank ahead of the
    8B Qwen model and the Gemma model.
    """
    gib = 1024**3

    def _single_variant_model(
        model_id, family, display, params, dl, liked, gguf_name, quant, size
    ):
        # Every model in this scenario ships exactly one GGUF variant.
        variant = GGUFVariant(
            filename=gguf_name,
            quant_type=quant,
            file_size_bytes=size,
        )
        return ModelInfo(
            id=model_id,
            family_id=family,
            name=display,
            parameter_count=params,
            downloads=dl,
            likes=liked,
            gguf_variants=[variant],
        )

    qwen_27b = _single_variant_model(
        "Qwen/Qwen3.6-27B",
        "qwen3.6-27b",
        "Qwen3.6-27B",
        27_800_000_000,
        5_300_000,
        10_000,
        "qwen3.6-27b-q4_k_m.gguf",
        "Q4_K_M",
        15 * gib,
    )
    qwen_14b = _single_variant_model(
        "Qwen/Qwen3-14B",
        "qwen3-14b",
        "Qwen3-14B",
        14_800_000_000,
        1_600_000,
        5_000,
        "qwen3-14b-q5_k_m.gguf",
        "Q5_K_M",
        9 * gib,
    )
    qwen_8b = _single_variant_model(
        "Qwen/Qwen3-8B",
        "qwen3-8b",
        "Qwen3-8B",
        8_200_000_000,
        11_000_000,
        5_000,
        "qwen3-8b-q5_k_m.gguf",
        "Q5_K_M",
        5 * gib,
    )
    gemma_9b = _single_variant_model(
        "google/gemma-2-9b-it",
        "gemma-2-9b-it",
        "gemma-2-9b-it",
        9_200_000_000,
        400_000,
        1_000,
        "gemma-2-9b-q5_k_m.gguf",
        "Q5_K_M",
        5_500_000_000,
    )

    rtx_3060 = GPUInfo(
        name="RTX 3060",
        vendor="nvidia",
        vram_bytes=12 * gib,
        compute_capability=(8, 6),
        memory_bandwidth_gbps=360.0,
    )
    hardware = HardwareInfo(
        gpus=[rtx_3060],
        cpu_name="Test CPU",
        cpu_cores=6,
        has_avx2=True,
        ram_bytes=32 * gib,
        disk_free_bytes=500 * gib,
        os="windows",
    )

    scores = {
        "Qwen/Qwen3.6-27B": 83.5,
        "Qwen/Qwen3-14B": 66.7,
        "Qwen/Qwen3-8B": 56.1,
        "google/gemma-2-9b-it": 35.1,
    }
    results = rank_models(
        [qwen_27b, qwen_14b, qwen_8b, gemma_9b],
        hardware,
        top_n=10,
        benchmark_scores=scores,
        task_profile="any",
    )

    ranked_ids = [entry.model.id for entry in results]

    def _rank_of(model_id):
        # Position of the first result for this model id in the ranking.
        return ranked_ids.index(model_id)

    def _result_for(model_id):
        # First ranked result belonging to this model id.
        return next(entry for entry in results if entry.model.id == model_id)

    # Ordering: the strong 27B model sits above both weaker candidates.
    assert _rank_of("Qwen/Qwen3.6-27B") < _rank_of("Qwen/Qwen3-8B")
    assert _rank_of("Qwen/Qwen3.6-27B") < _rank_of("google/gemma-2-9b-it")

    # It got there as a partial-offload candidate, with a higher quality score.
    strong = _result_for("Qwen/Qwen3.6-27B")
    assert strong.fit_type == "partial_offload"
    assert strong.quality_score > _result_for("Qwen/Qwen3-8B").quality_score


def test_moe_partial_offload_penalty_uses_active_working_set():
    """MoE models get a milder partial-offload factor than dense ones.

    Both fixtures have 30B total parameters and are evaluated at an offload
    ratio of 0.80; the MoE fixture declares 3B active parameters.
    """
    heavy_offload = 0.80
    total_params = 30_000_000_000

    dense_model = ModelInfo(
        id="example/Dense-30B",
        family_id="dense-30b",
        name="Dense-30B",
        parameter_count=total_params,
    )
    moe_model = ModelInfo(
        id="example/MoE-30B-A3B",
        family_id="moe-30b-a3b",
        name="MoE-30B-A3B",
        parameter_count=total_params,
        parameter_count_active=3_000_000_000,
        is_moe=True,
    )

    # Dense model at heavy offload is pinned to an exact factor.
    dense_factor = _partial_offload_quality_factor(dense_model, heavy_offload)
    assert dense_factor == 0.42

    # The MoE model must not fall below the relaxed floor.
    moe_factor = _partial_offload_quality_factor(moe_model, heavy_offload)
    assert moe_factor >= 0.66


def test_evidence_strict_filters_out_estimated_models():
direct_model = ModelInfo(
id="Qwen/Qwen2.5-7B-Instruct",
Expand Down