From 6028beeb62826ac5bf48fbfb009f749410f91e1d Mon Sep 17 00:00:00 2001
From: Andy <takuyarossi77@gmail.com>
Date: Sun, 14 Jun 2026 14:40:44 +0900
Subject: [PATCH] feat: add conservative multi-gpu simulation

---
 docs/cli.md                            |   6 +-
 docs/hardware.md                       |  30 +++++--
 src/whichllm/cli.py                    |  20 +++--
 src/whichllm/engine/compatibility.py   |  86 +++++++++++++++++--
 src/whichllm/engine/ranker.py          |  15 ++++
 src/whichllm/engine/types.py           |   2 +
 src/whichllm/hardware/gpu_simulator.py |  49 +++++++++++
 src/whichllm/output/json_output.py     |   3 +
 src/whichllm/output/ranking.py         |  31 ++++---
 tests/test_cli.py                      |  11 +++
 tests/test_compatibility.py            | 109 +++++++++++++++++++++++++
 tests/test_gpu_simulator.py            |  41 +++++++++-
 tests/test_ranker.py                   |  64 +++++++++++++++
 13 files changed, 434 insertions(+), 33 deletions(-)

diff --git a/docs/cli.md b/docs/cli.md
index 59ca8ea..0e443c2 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -28,7 +28,7 @@ Common options:
 | `--json` | Print machine-readable JSON |
 | `--refresh` | Ignore caches and fetch models/benchmarks again |
 | `--cpu-only` | Ignore GPUs and rank for CPU-only use |
-| `--gpu` | Simulate a GPU by name |
+| `--gpu` | Simulate GPU(s) by name. Accepts repeated flags, comma-separated values, and count shorthand such as `2x RTX 4090` |
 | `--vram` | Override simulated GPU VRAM in GB. Requires `--gpu` |
 | `--version` | Print the installed package version |
 
@@ -38,6 +38,9 @@ Examples:
 whichllm
 whichllm --gpu "RTX 4090"
 whichllm --gpu "RTX 5060 Ti" --vram 16
+whichllm --gpu "2x RTX 4090"
+whichllm --gpu "RTX 4090" --gpu "RTX 3090"
+whichllm --gpu "RTX 4090, RTX 3090"
 whichllm --profile coding --top 5
 whichllm --context-length 64k
 whichllm --evidence strict
@@ -70,6 +73,7 @@ whichllm hardware
 whichllm hardware --cpu-only
 whichllm hardware --gpu "Apple M3 Max"
 whichllm hardware --gpu "RTX 3060" --vram 12
+whichllm hardware --gpu "4x RTX 4090"
 ```
 
 ## `plan`
diff --git a/docs/hardware.md b/docs/hardware.md
index 7c1a691..38eef3b 100644
--- a/docs/hardware.md
+++ b/docs/hardware.md
@@ -134,6 +134,19 @@ whichllm hardware --gpu "Unknown GPU" --vram 24
 
 `--vram` requires `--gpu`.
 
+Multi-GPU simulation accepts repeated flags, comma-separated values, and count
+shorthand:
+
+```bash
+whichllm --gpu "2x RTX 4090"
+whichllm --gpu "RTX 4090" --gpu "RTX 3090"
+whichllm --gpu "RTX 4090, RTX 3090"
+```
+
+`--vram` is only supported for a single simulated GPU. For multi-GPU
+simulation, use known GPU names so whichllm can resolve each card's VRAM from
+the GPU database.
+
 ## Fit types
 
 Compatibility checks classify a candidate into one of three fit types:
@@ -147,19 +160,26 @@ Compatibility checks classify a candidate into one of three fit types:
 If neither GPU memory nor usable RAM can hold the model, the candidate is not
 ranked.
 
-whichllm reserves about 20% of system RAM for the OS and other processes.
+whichllm keeps a bounded system-RAM reserve for the OS and other processes.
 
 ## Multiple GPUs
 
-For fit checks, whichllm sums available GPU memory. For speed estimates, it uses
-the largest detected GPU as the representative device.
+For fit checks, whichllm uses a conservative multi-GPU budget rather than
+pretending all VRAM is one perfect device. It sums the VRAM of the pooled GPUs,
+subtracts about 0.3 GiB of framework overhead per GPU, and then keeps 95% of the
+remainder for matching GPUs or 90% for mixed (heterogeneous) sets.
 
 If a dedicated GPU is present, low-aperture shared-memory integrated GPUs are
 not added to the fit pool. This avoids treating unrelated system RAM and
 dedicated VRAM as one full-GPU target.
 
-This is a practical approximation. It does not model every tensor-parallel or
-pipeline-parallel runtime configuration.
+For speed estimates, whichllm uses the largest detected GPU as the
+representative device, scales its estimate by 0.70, and marks it low-confidence.
+This avoids claiming ideal scaling when real performance depends on backend split
+mode, PCIe/NVLink bandwidth, NCCL/RCCL support, batch size, and model architecture.
+
+This is a practical fit approximation. It does not model every tensor-parallel
+or pipeline-parallel runtime configuration.
 
 ## Disk checks
 
diff --git a/src/whichllm/cli.py b/src/whichllm/cli.py
index ff81f01..f041b4b 100644
--- a/src/whichllm/cli.py
+++ b/src/whichllm/cli.py
@@ -53,7 +53,7 @@ def _print_version(value: bool) -> None:
 
 def _validate_gpu_flags(
     cpu_only: bool,
-    gpu: str | None,
+    gpu: list[str] | None,
     vram: float | None,
 ) -> None:
     """Validate mutual exclusivity of GPU-related flags."""
@@ -99,17 +99,17 @@ def _resolve_evidence_mode(evidence: str, direct: bool) -> str:
 def _apply_gpu_overrides(
     hardware: HardwareInfo,
     cpu_only: bool,
-    gpu: str | None,
+    gpu: list[str] | None,
     vram: float | None,
 ) -> HardwareInfo:
     """Replace hardware.gpus based on CLI flags."""
     if cpu_only:
         hardware.gpus = []
     elif gpu:
-        from whichllm.hardware.gpu_simulator import create_synthetic_gpu
+        from whichllm.hardware.gpu_simulator import create_synthetic_gpus
 
         try:
-            hardware.gpus = [create_synthetic_gpu(gpu, vram)]
+            hardware.gpus = create_synthetic_gpus(gpu, vram)
         except ValueError as e:
             console.print(f"[red]Error:[/] {e}")
             raise typer.Exit(code=1)
@@ -251,8 +251,10 @@ def main(
     cpu_only: bool = typer.Option(
         False, "--cpu-only", help="Ignore GPU and run in CPU-only mode"
     ),
-    gpu: Optional[str] = typer.Option(
-        None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')"
+    gpu: Optional[list[str]] = typer.Option(
+        None,
+        "--gpu",
+        help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu",
     ),
     vram: Optional[float] = typer.Option(
         None, "--vram", help="Override VRAM in GB (requires --gpu)"
@@ -1101,8 +1103,10 @@ def hardware(
     cpu_only: bool = typer.Option(
         False, "--cpu-only", help="Ignore GPU and run in CPU-only mode"
     ),
-    gpu: Optional[str] = typer.Option(
-        None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')"
+    gpu: Optional[list[str]] = typer.Option(
+        None,
+        "--gpu",
+        help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu",
     ),
     vram: Optional[float] = typer.Option(
         None, "--vram", help="Override VRAM in GB (requires --gpu)"
diff --git a/src/whichllm/engine/compatibility.py b/src/whichllm/engine/compatibility.py
index 632965f..48b4681 100644
--- a/src/whichllm/engine/compatibility.py
+++ b/src/whichllm/engine/compatibility.py
@@ -12,6 +12,10 @@
 from whichllm.hardware.types import GPUInfo, HardwareInfo
 from whichllm.models.types import GGUFVariant, ModelInfo
 
+_MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES = int(0.3 * _GiB)
+_MULTI_GPU_HOMOGENEOUS_UTILIZATION = 0.95
+_MULTI_GPU_HETEROGENEOUS_UTILIZATION = 0.90
+
 
 def _gpu_available_memory(gpu: GPUInfo, usable_ram: int) -> int:
     if gpu.shared_memory and gpu.vram_bytes < 2 * _GiB:
@@ -46,6 +50,64 @@ def _fit_candidate_gpus(gpus: list[GPUInfo]) -> list[GPUInfo]:
     return [gpu for gpu in gpus if not _uses_shared_system_pool(gpu)]
 
 
+def _gpu_identity(gpu: GPUInfo) -> str:
+    """Return the lowercased GPU name without "(simulated)", spaces collapsed."""
+    return " ".join(gpu.name.lower().replace("(simulated)", "").split())
+
+
+def _is_homogeneous_gpu_set(gpus: list[GPUInfo], available: list[int]) -> bool:
+    """Return True when every GPU matches the first in vendor, name, and memory."""
+    if not gpus:
+        return True
+    ref_gpu, ref_bytes = gpus[0], available[0]
+    ref_name = _gpu_identity(ref_gpu)
+    # Memory matches within max(256 MiB, 2% of the first GPU's memory).
+    slack = max(256 * 1024**2, int(ref_bytes * 0.02))
+    for gpu, gpu_bytes in zip(gpus, available, strict=True):
+        same_model = gpu.vendor == ref_gpu.vendor and _gpu_identity(gpu) == ref_name
+        if not same_model or abs(gpu_bytes - ref_bytes) > slack:
+            return False
+    return True
+
+
+def _multi_gpu_effective_vram(
+    gpus: list[GPUInfo],
+    available: list[int],
+    warnings: list[str],
+) -> tuple[int, bool, int | None]:
+    """Return ``(fit budget, uses multi-GPU split, split budget or None)``.
+
+    ``available`` is per-GPU bytes parallel to ``gpus``; ``warnings`` is appended to.
+    """
+    raw_total = sum(available)
+    if len(gpus) <= 1:
+        return raw_total, False, None
+    if any(gpu.shared_memory or gpu.vendor == "apple" for gpu in gpus):
+        warnings.append(
+            "Multiple shared-memory GPUs are not pooled; using the largest "
+            "reported memory pool for fit checks"
+        )
+        return max(available), False, None
+
+    homogeneous = _is_homogeneous_gpu_set(gpus, available)
+    utilization = _MULTI_GPU_HETEROGENEOUS_UTILIZATION
+    if homogeneous:
+        utilization = _MULTI_GPU_HOMOGENEOUS_UTILIZATION
+    overhead = min(raw_total, len(gpus) * _MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES)
+    # Adding a card must never shrink the budget below the largest single GPU.
+    effective = max(max(available), int((raw_total - overhead) * utilization))
+    warnings.append(
+        "Multi-GPU fit uses a conservative layer-split budget: "
+        f"{effective / _GiB:.1f} GB effective from {raw_total / _GiB:.1f} GB raw VRAM"
+    )
+    if not homogeneous:
+        warnings.append(
+            "Heterogeneous multi-GPU setup: fit assumes uneven layer placement; "
+            "speed depends on backend split mode and interconnect"
+        )
+    return effective, True, effective
+
+
 def check_compatibility(
     model: ModelInfo,
     variant: GGUFVariant | None,
@@ -62,16 +124,25 @@ def check_compatibility(
     # Determine best GPU
     best_gpu: GPUInfo | None = None
     best_gpu_available = 0
-    total_vram = 0
+    gpu_available_values: list[int] = []
     candidate_gpus = _fit_candidate_gpus(hardware.gpus)
     for gpu in candidate_gpus:
         gpu_available = _gpu_available_memory(gpu, usable_ram)
-        total_vram += gpu_available
+        gpu_available_values.append(gpu_available)
         if best_gpu is None or gpu_available > best_gpu_available:
             best_gpu = gpu
             best_gpu_available = gpu_available
 
-    vram_available = total_vram if total_vram > 0 else 0
+    vram_available = sum(gpu_available_values) if gpu_available_values else 0
+    fit_vram_available, uses_multi_gpu, multi_gpu_effective_vram = (
+        _multi_gpu_effective_vram(candidate_gpus, gpu_available_values, warnings)
+    )
+    if (
+        len(candidate_gpus) > 1
+        and not uses_multi_gpu
+        and any(gpu.shared_memory or gpu.vendor == "apple" for gpu in candidate_gpus)
+    ):
+        vram_available = fit_vram_available
     offload_ram_available = (
         0
         if best_gpu and (best_gpu.shared_memory or best_gpu.vendor == "apple")
@@ -108,17 +179,18 @@ def check_compatibility(
         warnings.append("Metal requires macOS for Apple Silicon inference")
 
     # Determine fit type
-    if vram_available >= vram_required:
+    if fit_vram_available >= vram_required:
         fit_type = "full_gpu"
         can_run = True
         offload_ratio = 0.0
     elif (
-        vram_available > 0 and (vram_available + offload_ram_available) >= vram_required
+        fit_vram_available > 0
+        and (fit_vram_available + offload_ram_available) >= vram_required
     ):
         fit_type = "partial_offload"
         can_run = True
         offload_ratio = (
-            (vram_required - vram_available) / vram_required
+            (vram_required - fit_vram_available) / vram_required
             if vram_required > 0
             else 0.0
         )
@@ -171,6 +243,8 @@ def check_compatibility(
         vram_required_bytes=vram_required,
         vram_available_bytes=vram_available,
         offload_ratio=offload_ratio,
+        uses_multi_gpu=uses_multi_gpu,
+        multi_gpu_effective_vram_bytes=multi_gpu_effective_vram,
         warnings=warnings,
         fit_type=fit_type,
         context_fits=context_fits,
diff --git a/src/whichllm/engine/ranker.py b/src/whichllm/engine/ranker.py
index 2e3d8d8..20bd3db 100644
--- a/src/whichllm/engine/ranker.py
+++ b/src/whichllm/engine/ranker.py
@@ -33,6 +33,7 @@
 _LINEAGE_FAMILY_MAX: dict[str, int] = {
     family: max(idx for _, idx in entries) for family, entries in _LINEAGE_REGEX.items()
 }
+_MULTI_GPU_SPEED_FACTOR = 0.70
 
 
 def _family_selection_key(
@@ -755,6 +756,8 @@ def rank_models(
             tok_per_sec = estimate_tok_per_sec(
                 model, variant, best_gpu, compat.fit_type
             )
+            if compat.uses_multi_gpu:
+                tok_per_sec *= _MULTI_GPU_SPEED_FACTOR
             if min_speed is not None and tok_per_sec < min_speed:
                 continue
 
@@ -781,6 +784,18 @@ def rank_models(
                 compat.fit_type,
                 tok_per_sec,
             )
+            if compat.uses_multi_gpu:
+                compat.speed_confidence = "low"
+                if tok_per_sec > 0:
+                    compat.speed_range_tok_per_sec = (
+                        round(tok_per_sec * 0.35, 1),
+                        round(tok_per_sec * 2.0, 1),
+                    )
+                compat.speed_notes.append(
+                    "Multi-GPU speed depends on layer/tensor split mode, "
+                    "PCIe/NVLink bandwidth, and backend support; this estimate "
+                    "does not assume ideal scaling."
+                )
             compat.quality_score = _compute_quality_score(
                 model,
                 variant,
diff --git a/src/whichllm/engine/types.py b/src/whichllm/engine/types.py
index b83829a..a14fbe6 100644
--- a/src/whichllm/engine/types.py
+++ b/src/whichllm/engine/types.py
@@ -24,3 +24,5 @@ class CompatibilityResult:
     benchmark_source: str = "none"  # granular: "direct" | "variant" | "base_model" | "line_interp" | "self_reported" | "none"
     benchmark_confidence: float = 0.0  # 0.0-1.0 from BenchmarkEvidence
     context_fits: bool = True  # False when known model max context < requested
+    uses_multi_gpu: bool = False
+    multi_gpu_effective_vram_bytes: int | None = None
diff --git a/src/whichllm/hardware/gpu_simulator.py b/src/whichllm/hardware/gpu_simulator.py
index 1a21f03..2583e8b 100644
--- a/src/whichllm/hardware/gpu_simulator.py
+++ b/src/whichllm/hardware/gpu_simulator.py
@@ -8,6 +8,7 @@
 
 import logging
 import re
+from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -190,6 +191,54 @@ def _lookup_dbgpu(name: str) -> GPUSpecification | None:
 _last_suggestions: list[tuple[str, int]] = []
 
 
+def parse_synthetic_gpu_specs(values: Sequence[str] | str) -> list[str]:
+    """Flatten ``--gpu`` values into one GPU name per simulated device.
+
+    Each value may hold several comma-separated entries, and an entry may use
+    count shorthand (``2x RTX 4090``), which repeats the name that many times.
+    Names are returned as written; resolving them (fuzzy matching, aliases) is
+    left to ``create_synthetic_gpu``.
+
+    Raises:
+        ValueError: On an empty entry, a count below 1, or no GPUs at all.
+    """
+    shorthand = re.compile(r"^(\d+)\s*x\s+(.+)$", re.IGNORECASE)
+    entries = [values] if isinstance(values, str) else list(values)
+    expanded: list[str] = []
+
+    for entry in entries:
+        for spec in (piece.strip() for piece in entry.split(",")):
+            if not spec:
+                raise ValueError("Empty GPU entry in --gpu.")
+            matched = shorthand.match(spec)
+            if matched is None:
+                expanded.append(spec)
+                continue
+            repeat, name = int(matched.group(1)), matched.group(2).strip()
+            if repeat < 1:
+                raise ValueError("GPU count must be at least 1.")
+            if not name:
+                raise ValueError("GPU count shorthand requires a GPU name.")
+            expanded.extend([name] * repeat)
+    if not expanded:
+        raise ValueError("At least one GPU must be specified.")
+    return expanded
+
+
+def create_synthetic_gpus(
+    values: Sequence[str] | str,
+    vram_override_gb: float | None = None,
+) -> list[GPUInfo]:
+    """Build one synthetic GPU per parsed name; a VRAM override needs one name."""
+    specs = parse_synthetic_gpu_specs(values)
+    if len(specs) != 1 and vram_override_gb is not None:
+        raise ValueError(
+            "--vram currently supports exactly one simulated GPU. "
+            "For multi-GPU simulation, specify known GPU names and omit --vram."
+        )
+    return [create_synthetic_gpu(spec, vram_override_gb) for spec in specs]
+
+
 def create_synthetic_gpu(name: str, vram_override_gb: float | None = None) -> GPUInfo:
     """Create a synthetic GPUInfo from a GPU name.
 
diff --git a/src/whichllm/output/json_output.py b/src/whichllm/output/json_output.py
index 2e044de..4fa2f24 100644
--- a/src/whichllm/output/json_output.py
+++ b/src/whichllm/output/json_output.py
@@ -45,6 +45,9 @@ def display_json(results: list[CompatibilityResult], hardware: HardwareInfo) ->
                     else estimate_weight_bytes(r.model, None)
                 ),
                 "vram_required_bytes": r.vram_required_bytes,
+                "vram_available_bytes": r.vram_available_bytes,
+                "uses_multi_gpu": r.uses_multi_gpu,
+                "multi_gpu_effective_vram_bytes": r.multi_gpu_effective_vram_bytes,
                 "estimated_tok_per_sec": r.estimated_tok_per_sec,
                 "speed_confidence": r.speed_confidence,
                 "speed_range_tok_per_sec": (
diff --git a/src/whichllm/output/ranking.py b/src/whichllm/output/ranking.py
index c7d6df9..61b697a 100644
--- a/src/whichllm/output/ranking.py
+++ b/src/whichllm/output/ranking.py
@@ -42,35 +42,42 @@ def _top_pick_confidence(results: list[CompatibilityResult]) -> tuple[str, str]:
     """Return confidence level and explanation for top pick."""
     top = results[0]
     gap = (top.quality_score - results[1].quality_score) if len(results) > 1 else 999.0
-    fit_note = ""
+    notes: list[str] = []
     if top.fit_type == "partial_offload":
-        fit_note = ", partial offload"
+        notes.append("partial offload")
     elif top.fit_type == "cpu_only":
-        fit_note = ", CPU-only"
+        notes.append("CPU-only")
+    if top.speed_confidence == "low":
+        notes.append("low-confidence speed")
+    risk_note = f", {', '.join(notes)}" if notes else ""
 
     if top.benchmark_status == "none":
-        return "Low", f"no benchmark data, gap +{gap:.1f}{fit_note}"
+        return "Low", f"no benchmark data, gap +{gap:.1f}{risk_note}"
     if top.benchmark_status == "self_reported":
         return (
             "Low",
-            f"uploader-reported benchmark only (unverified), gap +{gap:.1f}{fit_note}",
+            f"uploader-reported benchmark only (unverified), gap +{gap:.1f}{risk_note}",
         )
     if top.benchmark_status == "estimated":
         if gap >= 2.0:
-            return "Medium", f"estimated benchmark, gap +{gap:.1f}{fit_note}"
-        return "Low", f"estimated benchmark, gap +{gap:.1f}{fit_note}"
+            confidence = "Medium"
+        else:
+            confidence = "Low"
+        if top.speed_confidence == "low" and confidence == "Medium":
+            confidence = "Low"
+        return confidence, f"estimated benchmark, gap +{gap:.1f}{risk_note}"
     if gap >= 2.5:
         confidence = "High"
-        reason = f"direct benchmark, gap +{gap:.1f}{fit_note}"
+        reason = f"direct benchmark, gap +{gap:.1f}{risk_note}"
     elif gap >= 1.0:
         confidence = "Medium"
-        reason = f"direct benchmark, gap +{gap:.1f}{fit_note}"
+        reason = f"direct benchmark, gap +{gap:.1f}{risk_note}"
     else:
         confidence = "Low"
-        reason = f"direct benchmark but very close (+{gap:.1f}){fit_note}"
+        reason = f"direct benchmark but very close (+{gap:.1f}){risk_note}"
 
-    # オフロード/CPU-onlyの1位は実運用で不確実性が高いため信頼度を1段階下げる
-    if top.fit_type != "full_gpu":
+    # Offloaded, CPU-only, or low-confidence-speed top picks are less certain: drop one level
+    if top.fit_type != "full_gpu" or top.speed_confidence == "low":
         if confidence == "High":
             confidence = "Medium"
         elif confidence == "Medium":
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b052616..8013fe9 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -6,6 +6,7 @@
 
 import whichllm.cli as cli_mod
 from whichllm.cli import (
+    _apply_gpu_overrides,
     _auto_min_params_for_profile,
     _fill_missing_published_at,
     _format_fetch_error,
@@ -60,6 +61,16 @@ def test_auto_min_params_non_general_disabled():
     assert _auto_min_params_for_profile(_hw_with_gpu(24), "coding") is None
 
 
+def test_apply_gpu_overrides_accepts_multiple_simulated_gpus():
+    hw = HardwareInfo(gpus=[], ram_bytes=64 * 1024**3, os="linux")
+
+    _apply_gpu_overrides(hw, cpu_only=False, gpu=["2x RTX 4090"], vram=None)
+
+    assert len(hw.gpus) == 2
+    assert all(gpu.vendor == "nvidia" for gpu in hw.gpus)
+    assert all(gpu.vram_bytes == 24 * 1024**3 for gpu in hw.gpus)
+
+
 def test_include_vision_candidates_by_profile():
     assert _include_vision_candidates("vision") is True
     assert _include_vision_candidates("any") is True
diff --git a/tests/test_compatibility.py b/tests/test_compatibility.py
index f170f21..ef792dd 100644
--- a/tests/test_compatibility.py
+++ b/tests/test_compatibility.py
@@ -1,5 +1,6 @@
 """Tests for compatibility checking."""
 
+from whichllm.constants import _GiB
 from whichllm.engine.compatibility import check_compatibility
 from whichllm.hardware.memory import estimate_usable_ram
 from whichllm.hardware.types import GPUInfo, HardwareInfo
@@ -160,6 +161,114 @@ def test_shared_memory_igpu_is_not_summed_with_dedicated_gpu():
     assert any("offloaded to CPU RAM" in w for w in result.warnings)
 
 
+def test_homogeneous_multi_gpu_uses_conservative_fit_budget():
+    model = _make_model(1_000_000_000)
+    variant = _make_variant(int(46 * _GiB))
+    hw = HardwareInfo(
+        gpus=[
+            GPUInfo(
+                name="NVIDIA GeForce RTX 4090",
+                vendor="nvidia",
+                vram_bytes=24 * _GiB,
+                compute_capability=(8, 9),
+                memory_bandwidth_gbps=1008.0,
+            ),
+            GPUInfo(
+                name="NVIDIA GeForce RTX 4090",
+                vendor="nvidia",
+                vram_bytes=24 * _GiB,
+                compute_capability=(8, 9),
+                memory_bandwidth_gbps=1008.0,
+            ),
+        ],
+        cpu_name="Test CPU",
+        cpu_cores=16,
+        ram_bytes=128 * _GiB,
+        disk_free_bytes=200 * _GiB,
+        os="linux",
+    )
+
+    result = check_compatibility(model, variant, hw)
+
+    assert result.can_run is True
+    assert result.fit_type == "partial_offload"
+    assert result.uses_multi_gpu is True
+    assert result.vram_available_bytes == 48 * _GiB
+    assert result.multi_gpu_effective_vram_bytes is not None
+    assert result.multi_gpu_effective_vram_bytes < result.vram_available_bytes
+    assert any("conservative layer-split budget" in w for w in result.warnings)
+
+
+def test_heterogeneous_multi_gpu_warns_about_split_assumptions():
+    model = _make_model()
+    variant = _make_variant(20 * _GiB)
+    hw = HardwareInfo(
+        gpus=[
+            GPUInfo(
+                name="NVIDIA GeForce RTX 4090",
+                vendor="nvidia",
+                vram_bytes=24 * _GiB,
+                compute_capability=(8, 9),
+                memory_bandwidth_gbps=1008.0,
+            ),
+            GPUInfo(
+                name="NVIDIA GeForce RTX 3060",
+                vendor="nvidia",
+                vram_bytes=12 * _GiB,
+                compute_capability=(8, 6),
+                memory_bandwidth_gbps=360.0,
+            ),
+        ],
+        cpu_name="Test CPU",
+        cpu_cores=16,
+        ram_bytes=64 * _GiB,
+        disk_free_bytes=200 * _GiB,
+        os="linux",
+    )
+
+    result = check_compatibility(model, variant, hw)
+
+    assert result.can_run is True
+    assert result.uses_multi_gpu is True
+    assert result.multi_gpu_effective_vram_bytes is not None
+    assert result.multi_gpu_effective_vram_bytes < 36 * _GiB
+    assert any("Heterogeneous multi-GPU" in w for w in result.warnings)
+
+
+def test_multiple_shared_memory_gpus_are_not_summed():
+    model = _make_model(120_000_000_000)
+    variant = _make_variant(70 * _GiB)
+    hw = HardwareInfo(
+        gpus=[
+            GPUInfo(
+                name="Integrated GPU A",
+                vendor="amd",
+                vram_bytes=0,
+                memory_bandwidth_gbps=120.0,
+                shared_memory=True,
+            ),
+            GPUInfo(
+                name="Integrated GPU B",
+                vendor="intel",
+                vram_bytes=0,
+                shared_memory=True,
+            ),
+        ],
+        cpu_name="Test CPU",
+        cpu_cores=16,
+        ram_bytes=64 * _GiB,
+        disk_free_bytes=200 * _GiB,
+        os="linux",
+    )
+
+    result = check_compatibility(model, variant, hw)
+
+    assert result.vram_available_bytes == estimate_usable_ram(hw.ram_bytes)
+    assert result.multi_gpu_effective_vram_bytes is None
+    assert result.fit_type == "cpu_only"
+    assert any("shared-memory GPUs are not pooled" in w for w in result.warnings)
+
+
 def test_apple_silicon_does_not_double_count_unified_memory():
     """Apple Silicon uses unified memory: vram_bytes IS the system RAM.
     The fit checker must not add a separate offload pool on top."""
diff --git a/tests/test_gpu_simulator.py b/tests/test_gpu_simulator.py
index f01584d..8c843d7 100644
--- a/tests/test_gpu_simulator.py
+++ b/tests/test_gpu_simulator.py
@@ -3,7 +3,46 @@
 import pytest
 
 from whichllm.constants import _GiB
-from whichllm.hardware.gpu_simulator import create_synthetic_gpu
+from whichllm.hardware.gpu_simulator import (
+    create_synthetic_gpu,
+    create_synthetic_gpus,
+    parse_synthetic_gpu_specs,
+)
+
+
+class TestMultiGPUSpecParsing:
+    def test_comma_separated_gpu_specs(self):
+        assert parse_synthetic_gpu_specs(["RTX 4090, RTX 3090"]) == [
+            "RTX 4090",
+            "RTX 3090",
+        ]
+
+    def test_repeated_gpu_specs(self):
+        assert parse_synthetic_gpu_specs(["RTX 4090", "RTX 3090"]) == [
+            "RTX 4090",
+            "RTX 3090",
+        ]
+
+    def test_count_shorthand(self):
+        assert parse_synthetic_gpu_specs(["2x RTX 4090, 1x RTX 3090"]) == [
+            "RTX 4090",
+            "RTX 4090",
+            "RTX 3090",
+        ]
+
+    def test_empty_entry_raises(self):
+        with pytest.raises(ValueError, match="Empty GPU entry"):
+            parse_synthetic_gpu_specs(["RTX 4090,"])
+
+    def test_create_synthetic_gpus_expands_count(self):
+        gpus = create_synthetic_gpus(["2x RTX 4090"])
+        assert len(gpus) == 2
+        assert all(gpu.vendor == "nvidia" for gpu in gpus)
+        assert all(gpu.vram_bytes == 24 * _GiB for gpu in gpus)
+
+    def test_multi_gpu_vram_override_is_rejected(self):
+        with pytest.raises(ValueError, match="exactly one simulated GPU"):
+            create_synthetic_gpus(["2x RTX 4090"], vram_override_gb=24)
 
 
 class TestKnownGPULookup:
diff --git a/tests/test_ranker.py b/tests/test_ranker.py
index 3b374ee..585c9cd 100644
--- a/tests/test_ranker.py
+++ b/tests/test_ranker.py
@@ -785,6 +785,70 @@ def test_unknown_speed_heavy_partial_offload_does_not_top_rank():
         assert heavy.estimated_tok_per_sec == 0.0
 
 
+def test_multi_gpu_speed_confidence_is_low():
+    from whichllm.engine.performance import estimate_tok_per_sec
+
+    model = ModelInfo(
+        id="org/Test-34B-GGUF",
+        family_id="org/Test-34B-GGUF",
+        name="Test-34B-GGUF",
+        parameter_count=34_000_000_000,
+        downloads=1000,
+        likes=100,
+        gguf_variants=[
+            GGUFVariant(
+                filename="test-34b-Q4_K_M.gguf",
+                quant_type="Q4_K_M",
+                file_size_bytes=22 * 1024**3,
+            )
+        ],
+    )
+    hw = HardwareInfo(
+        gpus=[
+            GPUInfo(
+                name="NVIDIA GeForce RTX 4090",
+                vendor="nvidia",
+                vram_bytes=24 * 1024**3,
+                compute_capability=(8, 9),
+                memory_bandwidth_gbps=1008.0,
+            ),
+            GPUInfo(
+                name="NVIDIA GeForce RTX 4090",
+                vendor="nvidia",
+                vram_bytes=24 * 1024**3,
+                compute_capability=(8, 9),
+                memory_bandwidth_gbps=1008.0,
+            ),
+        ],
+        cpu_name="Test CPU",
+        cpu_cores=16,
+        has_avx2=True,
+        ram_bytes=128 * 1024**3,
+        disk_free_bytes=500 * 1024**3,
+        os="linux",
+    )
+
+    results = rank_models(
+        [model],
+        hw,
+        top_n=1,
+        benchmark_scores={"org/Test-34B-GGUF": 70.0},
+    )
+
+    assert results
+    assert results[0].fit_type == "full_gpu"
+    assert results[0].uses_multi_gpu is True
+    assert results[0].speed_confidence == "low"
+    single_gpu_speed = estimate_tok_per_sec(
+        model,
+        model.gguf_variants[0],
+        hw.gpus[0],
+        "full_gpu",
+    )
+    assert results[0].estimated_tok_per_sec == single_gpu_speed * 0.70
+    assert any("Multi-GPU speed depends" in note for note in results[0].speed_notes)
+
+
 def test_benchmark_source_and_confidence_exposed_for_direct():
     model = ModelInfo(
         id="Qwen/Qwen2.5-7B-Instruct",