From 6028beeb62826ac5bf48fbfb009f749410f91e1d Mon Sep 17 00:00:00 2001 From: Andy Date: Sun, 14 Jun 2026 14:40:44 +0900 Subject: [PATCH] feat: add conservative multi-gpu simulation --- docs/cli.md | 6 +- docs/hardware.md | 30 +++++-- src/whichllm/cli.py | 20 +++-- src/whichllm/engine/compatibility.py | 86 +++++++++++++++++-- src/whichllm/engine/ranker.py | 15 ++++ src/whichllm/engine/types.py | 2 + src/whichllm/hardware/gpu_simulator.py | 49 +++++++++++ src/whichllm/output/json_output.py | 3 + src/whichllm/output/ranking.py | 31 ++++--- tests/test_cli.py | 11 +++ tests/test_compatibility.py | 109 +++++++++++++++++++++++++ tests/test_gpu_simulator.py | 41 +++++++++- tests/test_ranker.py | 64 +++++++++++++++ 13 files changed, 434 insertions(+), 33 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 59ca8ea..0e443c2 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -28,7 +28,7 @@ Common options: | `--json` | Print machine-readable JSON | | `--refresh` | Ignore caches and fetch models/benchmarks again | | `--cpu-only` | Ignore GPUs and rank for CPU-only use | -| `--gpu` | Simulate a GPU by name | +| `--gpu` | Simulate GPU(s) by name. Accepts repeated flags, comma-separated values, and count shorthand | | `--vram` | Override simulated GPU VRAM in GB. Requires `--gpu` | | `--version` | Print the installed package version | @@ -38,6 +38,9 @@ Examples: whichllm whichllm --gpu "RTX 4090" whichllm --gpu "RTX 5060 Ti" --vram 16 +whichllm --gpu "2x RTX 4090" +whichllm --gpu "RTX 4090" --gpu "RTX 3090" +whichllm --gpu "RTX 4090, RTX 3090" whichllm --profile coding --top 5 whichllm --context-length 64k whichllm --evidence strict @@ -70,6 +73,7 @@ whichllm hardware whichllm hardware --cpu-only whichllm hardware --gpu "Apple M3 Max" whichllm hardware --gpu "RTX 3060" --vram 12 +whichllm hardware --gpu "4x RTX 4090" ``` ## `plan` diff --git a/docs/hardware.md b/docs/hardware.md index 7c1a691..38eef3b 100644 --- a/docs/hardware.md +++ b/docs/hardware.md @@ -134,6 +134,19 @@ whichllm hardware --gpu "Unknown GPU" --vram 24 `--vram` requires `--gpu`. +Multi-GPU simulation accepts repeated flags, comma-separated values, and count +shorthand: + +```bash +whichllm --gpu "2x RTX 4090" +whichllm --gpu "RTX 4090" --gpu "RTX 3090" +whichllm --gpu "RTX 4090, RTX 3090" +``` + +`--vram` is only supported for a single simulated GPU. For multi-GPU +simulation, use known GPU names so whichllm can resolve each card's VRAM from +the GPU database. + ## Fit types Compatibility checks classify a candidate into one of three fit types: @@ -147,19 +160,26 @@ Compatibility checks classify a candidate into one of three fit types: If neither GPU memory nor usable RAM can hold the model, the candidate is not ranked. -whichllm reserves about 20% of system RAM for the OS and other processes. +whichllm keeps a bounded system-RAM reserve for the OS and other processes. ## Multiple GPUs -For fit checks, whichllm sums available GPU memory. For speed estimates, it uses -the largest detected GPU as the representative device. +For fit checks, whichllm uses a conservative multi-GPU budget rather than +pretending all VRAM is one perfect device. It starts from raw total VRAM, applies +a small per-GPU overhead, and then applies a utilization factor. Homogeneous +sets receive a less severe reduction than heterogeneous sets. If a dedicated GPU is present, low-aperture shared-memory integrated GPUs are not added to the fit pool. This avoids treating unrelated system RAM and dedicated VRAM as one full-GPU target. -This is a practical approximation. It does not model every tensor-parallel or -pipeline-parallel runtime configuration. +For speed estimates, whichllm uses the largest detected GPU as the +representative device and marks multi-GPU speed as low-confidence. This avoids +claiming ideal scaling when real performance depends on backend split mode, +PCIe/NVLink bandwidth, NCCL/RCCL support, batch size, and model architecture. + +This is a practical fit approximation. It does not model every tensor-parallel +or pipeline-parallel runtime configuration. ## Disk checks diff --git a/src/whichllm/cli.py b/src/whichllm/cli.py index ff81f01..f041b4b 100644 --- a/src/whichllm/cli.py +++ b/src/whichllm/cli.py @@ -53,7 +53,7 @@ def _print_version(value: bool) -> None: def _validate_gpu_flags( cpu_only: bool, - gpu: str | None, + gpu: list[str] | None, vram: float | None, ) -> None: """Validate mutual exclusivity of GPU-related flags.""" @@ -99,17 +99,17 @@ def _resolve_evidence_mode(evidence: str, direct: bool) -> str: def _apply_gpu_overrides( hardware: HardwareInfo, cpu_only: bool, - gpu: str | None, + gpu: list[str] | None, vram: float | None, ) -> HardwareInfo: """Replace hardware.gpus based on CLI flags.""" if cpu_only: hardware.gpus = [] elif gpu: - from whichllm.hardware.gpu_simulator import create_synthetic_gpu + from whichllm.hardware.gpu_simulator import create_synthetic_gpus try: - hardware.gpus = [create_synthetic_gpu(gpu, vram)] + hardware.gpus = create_synthetic_gpus(gpu, vram) except ValueError as e: console.print(f"[red]Error:[/] {e}") raise typer.Exit(code=1) @@ -251,8 +251,10 @@ def main( cpu_only: bool = typer.Option( False, "--cpu-only", help="Ignore GPU and run in CPU-only mode" ), - gpu: Optional[str] = typer.Option( - None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')" + gpu: Optional[list[str]] = typer.Option( + None, + "--gpu", + help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu", ), vram: Optional[float] = typer.Option( None, "--vram", help="Override VRAM in GB (requires --gpu)" @@ -1101,8 +1103,10 @@ def hardware( cpu_only: bool = typer.Option( False, "--cpu-only", help="Ignore GPU and run in CPU-only mode" ), - gpu: Optional[str] = typer.Option( - None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')" + gpu: Optional[list[str]] = typer.Option( + None, + "--gpu", + help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu", ), vram: Optional[float] = typer.Option( None, "--vram", help="Override VRAM in GB (requires --gpu)" diff --git a/src/whichllm/engine/compatibility.py b/src/whichllm/engine/compatibility.py index 632965f..48b4681 100644 --- a/src/whichllm/engine/compatibility.py +++ b/src/whichllm/engine/compatibility.py @@ -12,6 +12,10 @@ from whichllm.hardware.types import GPUInfo, HardwareInfo from whichllm.models.types import GGUFVariant, ModelInfo +_MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES = int(0.3 * _GiB) +_MULTI_GPU_HOMOGENEOUS_UTILIZATION = 0.95 +_MULTI_GPU_HETEROGENEOUS_UTILIZATION = 0.90 + def _gpu_available_memory(gpu: GPUInfo, usable_ram: int) -> int: if gpu.shared_memory and gpu.vram_bytes < 2 * _GiB: @@ -46,6 +50,64 @@ def _fit_candidate_gpus(gpus: list[GPUInfo]) -> list[GPUInfo]: return [gpu for gpu in gpus if not _uses_shared_system_pool(gpu)] +def _gpu_identity(gpu: GPUInfo) -> str: + name = gpu.name.lower().replace("(simulated)", "") + return " ".join(name.split()) + + +def _is_homogeneous_gpu_set(gpus: list[GPUInfo], available: list[int]) -> bool: + if not gpus: + return True + first = gpus[0] + first_identity = _gpu_identity(first) + first_available = available[0] + vram_tolerance = max(256 * 1024**2, int(first_available * 0.02)) + return all( + gpu.vendor == first.vendor + and _gpu_identity(gpu) == first_identity + and abs(gpu_available - first_available) <= vram_tolerance + for gpu, gpu_available in zip(gpus, available, strict=True) + ) + + +def _multi_gpu_effective_vram( + gpus: list[GPUInfo], + available: list[int], + warnings: list[str], +) -> tuple[int, bool, int | None]: + raw_total = sum(available) + if len(gpus) <= 1: + return raw_total, False, None + + if any(gpu.shared_memory or gpu.vendor == "apple" for gpu in gpus): + effective = max(available) + warnings.append( + "Multiple shared-memory GPUs are not pooled; using the largest " + "reported memory pool for fit checks" + ) + return effective, False, None + + homogeneous = _is_homogeneous_gpu_set(gpus, available) + utilization = ( + _MULTI_GPU_HOMOGENEOUS_UTILIZATION + if homogeneous + else _MULTI_GPU_HETEROGENEOUS_UTILIZATION + ) + overhead = min(raw_total, len(gpus) * _MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES) + effective = int((raw_total - overhead) * utilization) + + warnings.append( + "Multi-GPU fit uses a conservative layer-split budget: " + f"{effective / _GiB:.1f} GB effective from {raw_total / _GiB:.1f} GB raw VRAM" + ) + if not homogeneous: + warnings.append( + "Heterogeneous multi-GPU setup: fit assumes uneven layer placement; " + "speed depends on backend split mode and interconnect" + ) + return effective, True, effective + + def check_compatibility( model: ModelInfo, variant: GGUFVariant | None, @@ -62,16 +124,25 @@ def check_compatibility( # Determine best GPU best_gpu: GPUInfo | None = None best_gpu_available = 0 - total_vram = 0 + gpu_available_values: list[int] = [] candidate_gpus = _fit_candidate_gpus(hardware.gpus) for gpu in candidate_gpus: gpu_available = _gpu_available_memory(gpu, usable_ram) - total_vram += gpu_available + gpu_available_values.append(gpu_available) if best_gpu is None or gpu_available > best_gpu_available: best_gpu = gpu best_gpu_available = gpu_available - vram_available = total_vram if total_vram > 0 else 0 + vram_available = sum(gpu_available_values) if gpu_available_values else 0 + fit_vram_available, uses_multi_gpu, multi_gpu_effective_vram = ( + _multi_gpu_effective_vram(candidate_gpus, gpu_available_values, warnings) + ) + if ( + len(candidate_gpus) > 1 + and not uses_multi_gpu + and any(gpu.shared_memory or gpu.vendor == "apple" for gpu in candidate_gpus) + ): + vram_available = fit_vram_available offload_ram_available = ( 0 if best_gpu and (best_gpu.shared_memory or best_gpu.vendor == "apple") @@ -108,17 +179,18 @@ def check_compatibility( warnings.append("Metal requires macOS for Apple Silicon inference") # Determine fit type - if vram_available >= vram_required: + if fit_vram_available >= vram_required: fit_type = "full_gpu" can_run = True offload_ratio = 0.0 elif ( - vram_available > 0 and (vram_available + offload_ram_available) >= vram_required + fit_vram_available > 0 + and (fit_vram_available + offload_ram_available) >= vram_required ): fit_type = "partial_offload" can_run = True offload_ratio = ( - (vram_required - vram_available) / vram_required + (vram_required - fit_vram_available) / vram_required if vram_required > 0 else 0.0 ) @@ -171,6 +243,8 @@ def check_compatibility( vram_required_bytes=vram_required, vram_available_bytes=vram_available, offload_ratio=offload_ratio, + uses_multi_gpu=uses_multi_gpu, + multi_gpu_effective_vram_bytes=multi_gpu_effective_vram, warnings=warnings, fit_type=fit_type, context_fits=context_fits, diff --git a/src/whichllm/engine/ranker.py b/src/whichllm/engine/ranker.py index 2e3d8d8..20bd3db 100644 --- a/src/whichllm/engine/ranker.py +++ b/src/whichllm/engine/ranker.py @@ -33,6 +33,7 @@ _LINEAGE_FAMILY_MAX: dict[str, int] = { family: max(idx for _, idx in entries) for family, entries in _LINEAGE_REGEX.items() } +_MULTI_GPU_SPEED_FACTOR = 0.70 def _family_selection_key( @@ -755,6 +756,8 @@ def rank_models( tok_per_sec = estimate_tok_per_sec( model, variant, best_gpu, compat.fit_type ) + if compat.uses_multi_gpu: + tok_per_sec *= _MULTI_GPU_SPEED_FACTOR if min_speed is not None and tok_per_sec < min_speed: continue @@ -781,6 +784,18 @@ def rank_models( compat.fit_type, tok_per_sec, ) + if compat.uses_multi_gpu: + compat.speed_confidence = "low" + if tok_per_sec > 0: + compat.speed_range_tok_per_sec = ( + round(tok_per_sec * 0.35, 1), + round(tok_per_sec * 2.0, 1), + ) + compat.speed_notes.append( + "Multi-GPU speed depends on layer/tensor split mode, " + "PCIe/NVLink bandwidth, and backend support; this estimate " + "does not assume ideal scaling." + ) compat.quality_score = _compute_quality_score( model, variant, diff --git a/src/whichllm/engine/types.py b/src/whichllm/engine/types.py index b83829a..a14fbe6 100644 --- a/src/whichllm/engine/types.py +++ b/src/whichllm/engine/types.py @@ -24,3 +24,5 @@ class CompatibilityResult: benchmark_source: str = "none" # granular: "direct" | "variant" | "base_model" | "line_interp" | "self_reported" | "none" benchmark_confidence: float = 0.0 # 0.0-1.0 from BenchmarkEvidence context_fits: bool = True # False when known model max context < requested + uses_multi_gpu: bool = False + multi_gpu_effective_vram_bytes: int | None = None diff --git a/src/whichllm/hardware/gpu_simulator.py b/src/whichllm/hardware/gpu_simulator.py index 1a21f03..2583e8b 100644 --- a/src/whichllm/hardware/gpu_simulator.py +++ b/src/whichllm/hardware/gpu_simulator.py @@ -8,6 +8,7 @@ import logging import re +from collections.abc import Sequence from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -190,6 +191,54 @@ def _lookup_dbgpu(name: str) -> GPUSpecification | None: _last_suggestions: list[tuple[str, int]] = [] +def parse_synthetic_gpu_specs(values: Sequence[str] | str) -> list[str]: + """Expand CLI GPU simulation values into individual GPU names. + + Accepts repeated options, comma-separated names, and count shorthand such + as ``2x RTX 4090``. The returned names are still looked up by + ``create_synthetic_gpu`` so existing fuzzy matching and aliases stay in + one place. + """ + raw_values = [values] if isinstance(values, str) else list(values) + gpu_names: list[str] = [] + + for raw in raw_values: + for part in raw.split(","): + spec = part.strip() + if not spec: + raise ValueError("Empty GPU entry in --gpu.") + + count_match = re.match(r"^(\d+)\s*x\s+(.+)$", spec, re.IGNORECASE) + if count_match: + count = int(count_match.group(1)) + name = count_match.group(2).strip() + if count < 1: + raise ValueError("GPU count must be at least 1.") + if not name: + raise ValueError("GPU count shorthand requires a GPU name.") + gpu_names.extend([name] * count) + else: + gpu_names.append(spec) + + if not gpu_names: + raise ValueError("At least one GPU must be specified.") + return gpu_names + + +def create_synthetic_gpus( + values: Sequence[str] | str, + vram_override_gb: float | None = None, +) -> list[GPUInfo]: + """Create one or more synthetic GPUs from CLI-style values.""" + names = parse_synthetic_gpu_specs(values) + if vram_override_gb is not None and len(names) != 1: + raise ValueError( + "--vram currently supports exactly one simulated GPU. " + "For multi-GPU simulation, specify known GPU names and omit --vram." + ) + return [create_synthetic_gpu(name, vram_override_gb) for name in names] + + def create_synthetic_gpu(name: str, vram_override_gb: float | None = None) -> GPUInfo: """Create a synthetic GPUInfo from a GPU name. diff --git a/src/whichllm/output/json_output.py b/src/whichllm/output/json_output.py index 2e044de..4fa2f24 100644 --- a/src/whichllm/output/json_output.py +++ b/src/whichllm/output/json_output.py @@ -45,6 +45,9 @@ def display_json(results: list[CompatibilityResult], hardware: HardwareInfo) -> else estimate_weight_bytes(r.model, None) ), "vram_required_bytes": r.vram_required_bytes, + "vram_available_bytes": r.vram_available_bytes, + "uses_multi_gpu": r.uses_multi_gpu, + "multi_gpu_effective_vram_bytes": r.multi_gpu_effective_vram_bytes, "estimated_tok_per_sec": r.estimated_tok_per_sec, "speed_confidence": r.speed_confidence, "speed_range_tok_per_sec": ( diff --git a/src/whichllm/output/ranking.py b/src/whichllm/output/ranking.py index c7d6df9..61b697a 100644 --- a/src/whichllm/output/ranking.py +++ b/src/whichllm/output/ranking.py @@ -42,35 +42,42 @@ def _top_pick_confidence(results: list[CompatibilityResult]) -> tuple[str, str]: """Return confidence level and explanation for top pick.""" top = results[0] gap = (top.quality_score - results[1].quality_score) if len(results) > 1 else 999.0 - fit_note = "" + notes: list[str] = [] if top.fit_type == "partial_offload": - fit_note = ", partial offload" + notes.append("partial offload") elif top.fit_type == "cpu_only": - fit_note = ", CPU-only" + notes.append("CPU-only") + if top.speed_confidence == "low": + notes.append("low-confidence speed") + risk_note = f", {', '.join(notes)}" if notes else "" if top.benchmark_status == "none": - return "Low", f"no benchmark data, gap +{gap:.1f}{fit_note}" + return "Low", f"no benchmark data, gap +{gap:.1f}{risk_note}" if top.benchmark_status == "self_reported": return ( "Low", - f"uploader-reported benchmark only (unverified), gap +{gap:.1f}{fit_note}", + f"uploader-reported benchmark only (unverified), gap +{gap:.1f}{risk_note}", ) if top.benchmark_status == "estimated": if gap >= 2.0: - return "Medium", f"estimated benchmark, gap +{gap:.1f}{fit_note}" - return "Low", f"estimated benchmark, gap +{gap:.1f}{fit_note}" + confidence = "Medium" + else: + confidence = "Low" + if top.speed_confidence == "low" and confidence == "Medium": + confidence = "Low" + return confidence, f"estimated benchmark, gap +{gap:.1f}{risk_note}" if gap >= 2.5: confidence = "High" - reason = f"direct benchmark, gap +{gap:.1f}{fit_note}" + reason = f"direct benchmark, gap +{gap:.1f}{risk_note}" elif gap >= 1.0: confidence = "Medium" - reason = f"direct benchmark, gap +{gap:.1f}{fit_note}" + reason = f"direct benchmark, gap +{gap:.1f}{risk_note}" else: confidence = "Low" - reason = f"direct benchmark but very close (+{gap:.1f}){fit_note}" + reason = f"direct benchmark but very close (+{gap:.1f}){risk_note}" - # オフロード/CPU-onlyの1位は実運用で不確実性が高いため信頼度を1段階下げる - if top.fit_type != "full_gpu": + # オフロード/CPU-only/低信頼speedの1位は実運用で不確実性が高いため信頼度を1段階下げる + if top.fit_type != "full_gpu" or top.speed_confidence == "low": if confidence == "High": confidence = "Medium" elif confidence == "Medium": diff --git a/tests/test_cli.py b/tests/test_cli.py index b052616..8013fe9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,6 +6,7 @@ import whichllm.cli as cli_mod from whichllm.cli import ( + _apply_gpu_overrides, _auto_min_params_for_profile, _fill_missing_published_at, _format_fetch_error, @@ -60,6 +61,16 @@ def test_auto_min_params_non_general_disabled(): assert _auto_min_params_for_profile(_hw_with_gpu(24), "coding") is None +def test_apply_gpu_overrides_accepts_multiple_simulated_gpus(): + hw = HardwareInfo(gpus=[], ram_bytes=64 * 1024**3, os="linux") + + _apply_gpu_overrides(hw, cpu_only=False, gpu=["2x RTX 4090"], vram=None) + + assert len(hw.gpus) == 2 + assert all(gpu.vendor == "nvidia" for gpu in hw.gpus) + assert all(gpu.vram_bytes == 24 * 1024**3 for gpu in hw.gpus) + + def test_include_vision_candidates_by_profile(): assert _include_vision_candidates("vision") is True assert _include_vision_candidates("any") is True diff --git a/tests/test_compatibility.py b/tests/test_compatibility.py index f170f21..ef792dd 100644 --- a/tests/test_compatibility.py +++ b/tests/test_compatibility.py @@ -1,5 +1,6 @@ """Tests for compatibility checking.""" +from whichllm.constants import _GiB from whichllm.engine.compatibility import check_compatibility from whichllm.hardware.memory import estimate_usable_ram from whichllm.hardware.types import GPUInfo, HardwareInfo @@ -160,6 +161,114 @@ def test_shared_memory_igpu_is_not_summed_with_dedicated_gpu(): assert any("offloaded to CPU RAM" in w for w in result.warnings) +def test_homogeneous_multi_gpu_uses_conservative_fit_budget(): + model = _make_model(1_000_000_000) + variant = _make_variant(int(46 * _GiB)) + hw = HardwareInfo( + gpus=[ + GPUInfo( + name="NVIDIA GeForce RTX 4090", + vendor="nvidia", + vram_bytes=24 * _GiB, + compute_capability=(8, 9), + memory_bandwidth_gbps=1008.0, + ), + GPUInfo( + name="NVIDIA GeForce RTX 4090", + vendor="nvidia", + vram_bytes=24 * _GiB, + compute_capability=(8, 9), + memory_bandwidth_gbps=1008.0, + ), + ], + cpu_name="Test CPU", + cpu_cores=16, + ram_bytes=128 * _GiB, + disk_free_bytes=200 * _GiB, + os="linux", + ) + + result = check_compatibility(model, variant, hw) + + assert result.can_run is True + assert result.fit_type == "partial_offload" + assert result.uses_multi_gpu is True + assert result.vram_available_bytes == 48 * _GiB + assert result.multi_gpu_effective_vram_bytes is not None + assert result.multi_gpu_effective_vram_bytes < result.vram_available_bytes + assert any("conservative layer-split budget" in w for w in result.warnings) + + +def test_heterogeneous_multi_gpu_warns_about_split_assumptions(): + model = _make_model() + variant = _make_variant(20 * _GiB) + hw = HardwareInfo( + gpus=[ + GPUInfo( + name="NVIDIA GeForce RTX 4090", + vendor="nvidia", + vram_bytes=24 * _GiB, + compute_capability=(8, 9), + memory_bandwidth_gbps=1008.0, + ), + GPUInfo( + name="NVIDIA GeForce RTX 3060", + vendor="nvidia", + vram_bytes=12 * _GiB, + compute_capability=(8, 6), + memory_bandwidth_gbps=360.0, + ), + ], + cpu_name="Test CPU", + cpu_cores=16, + ram_bytes=64 * _GiB, + disk_free_bytes=200 * _GiB, + os="linux", + ) + + result = check_compatibility(model, variant, hw) + + assert result.can_run is True + assert result.uses_multi_gpu is True + assert result.multi_gpu_effective_vram_bytes is not None + assert result.multi_gpu_effective_vram_bytes < 36 * _GiB + assert any("Heterogeneous multi-GPU" in w for w in result.warnings) + + +def test_multiple_shared_memory_gpus_are_not_summed(): + model = _make_model(120_000_000_000) + variant = _make_variant(70 * _GiB) + hw = HardwareInfo( + gpus=[ + GPUInfo( + name="Integrated GPU A", + vendor="amd", + vram_bytes=0, + memory_bandwidth_gbps=120.0, + shared_memory=True, + ), + GPUInfo( + name="Integrated GPU B", + vendor="intel", + vram_bytes=0, + shared_memory=True, + ), + ], + cpu_name="Test CPU", + cpu_cores=16, + ram_bytes=64 * _GiB, + disk_free_bytes=200 * _GiB, + os="linux", + ) + + result = check_compatibility(model, variant, hw) + + assert result.vram_available_bytes == estimate_usable_ram(hw.ram_bytes) + assert result.multi_gpu_effective_vram_bytes is None + assert result.fit_type == "cpu_only" + assert any("shared-memory GPUs are not pooled" in w for w in result.warnings) + + def test_apple_silicon_does_not_double_count_unified_memory(): """Apple Silicon uses unified memory: vram_bytes IS the system RAM. The fit checker must not add a separate offload pool on top.""" diff --git a/tests/test_gpu_simulator.py b/tests/test_gpu_simulator.py index f01584d..8c843d7 100644 --- a/tests/test_gpu_simulator.py +++ b/tests/test_gpu_simulator.py @@ -3,7 +3,46 @@ import pytest from whichllm.constants import _GiB -from whichllm.hardware.gpu_simulator import create_synthetic_gpu +from whichllm.hardware.gpu_simulator import ( + create_synthetic_gpu, + create_synthetic_gpus, + parse_synthetic_gpu_specs, +) + + +class TestMultiGPUSpecParsing: + def test_comma_separated_gpu_specs(self): + assert parse_synthetic_gpu_specs(["RTX 4090, RTX 3090"]) == [ + "RTX 4090", + "RTX 3090", + ] + + def test_repeated_gpu_specs(self): + assert parse_synthetic_gpu_specs(["RTX 4090", "RTX 3090"]) == [ + "RTX 4090", + "RTX 3090", + ] + + def test_count_shorthand(self): + assert parse_synthetic_gpu_specs(["2x RTX 4090, 1x RTX 3090"]) == [ + "RTX 4090", + "RTX 4090", + "RTX 3090", + ] + + def test_empty_entry_raises(self): + with pytest.raises(ValueError, match="Empty GPU entry"): + parse_synthetic_gpu_specs(["RTX 4090,"]) + + def test_create_synthetic_gpus_expands_count(self): + gpus = create_synthetic_gpus(["2x RTX 4090"]) + assert len(gpus) == 2 + assert all(gpu.vendor == "nvidia" for gpu in gpus) + assert all(gpu.vram_bytes == 24 * _GiB for gpu in gpus) + + def test_multi_gpu_vram_override_is_rejected(self): + with pytest.raises(ValueError, match="exactly one simulated GPU"): + create_synthetic_gpus(["2x RTX 4090"], vram_override_gb=24) class TestKnownGPULookup: diff --git a/tests/test_ranker.py b/tests/test_ranker.py index 3b374ee..585c9cd 100644 --- a/tests/test_ranker.py +++ b/tests/test_ranker.py @@ -785,6 +785,70 @@ def test_unknown_speed_heavy_partial_offload_does_not_top_rank(): assert heavy.estimated_tok_per_sec == 0.0 +def test_multi_gpu_speed_confidence_is_low(): + from whichllm.engine.performance import estimate_tok_per_sec + + model = ModelInfo( + id="org/Test-34B-GGUF", + family_id="org/Test-34B-GGUF", + name="Test-34B-GGUF", + parameter_count=34_000_000_000, + downloads=1000, + likes=100, + gguf_variants=[ + GGUFVariant( + filename="test-34b-Q4_K_M.gguf", + quant_type="Q4_K_M", + file_size_bytes=22 * 1024**3, + ) + ], + ) + hw = HardwareInfo( + gpus=[ + GPUInfo( + name="NVIDIA GeForce RTX 4090", + vendor="nvidia", + vram_bytes=24 * 1024**3, + compute_capability=(8, 9), + memory_bandwidth_gbps=1008.0, + ), + GPUInfo( + name="NVIDIA GeForce RTX 4090", + vendor="nvidia", + vram_bytes=24 * 1024**3, + compute_capability=(8, 9), + memory_bandwidth_gbps=1008.0, + ), + ], + cpu_name="Test CPU", + cpu_cores=16, + has_avx2=True, + ram_bytes=128 * 1024**3, + disk_free_bytes=500 * 1024**3, + os="linux", + ) + + results = rank_models( + [model], + hw, + top_n=1, + benchmark_scores={"org/Test-34B-GGUF": 70.0}, + ) + + assert results + assert results[0].fit_type == "full_gpu" + assert results[0].uses_multi_gpu is True + assert results[0].speed_confidence == "low" + single_gpu_speed = estimate_tok_per_sec( + model, + model.gguf_variants[0], + hw.gpus[0], + "full_gpu", + ) + assert results[0].estimated_tok_per_sec == single_gpu_speed * 0.70 + assert any("Multi-GPU speed depends" in note for note in results[0].speed_notes) + + def test_benchmark_source_and_confidence_exposed_for_direct(): model = ModelInfo( id="Qwen/Qwen2.5-7B-Instruct",