Andyyyy64 · Andyyyy64 · Jun 14, 2026
diff --git a/docs/cli.md b/docs/cli.md
@@ -28,7 +28,7 @@ Common options:
 | `--json` | Print machine-readable JSON |
 | `--refresh` | Ignore caches and fetch models/benchmarks again |
 | `--cpu-only` | Ignore GPUs and rank for CPU-only use |
-| `--gpu` | Simulate a GPU by name |
+| `--gpu` | Simulate GPU(s) by name. Accepts repeated flags, comma-separated values, and count shorthand (e.g. `2x RTX 4090`) |
 | `--vram` | Override simulated GPU VRAM in GB. Requires `--gpu` |
 | `--version` | Print the installed package version |
 
@@ -38,6 +38,9 @@ Examples:
 whichllm
 whichllm --gpu "RTX 4090"
 whichllm --gpu "RTX 5060 Ti" --vram 16
+whichllm --gpu "2x RTX 4090"
+whichllm --gpu "RTX 4090" --gpu "RTX 3090"
+whichllm --gpu "RTX 4090, RTX 3090"
 whichllm --profile coding --top 5
 whichllm --context-length 64k
 whichllm --evidence strict
@@ -70,6 +73,7 @@ whichllm hardware
 whichllm hardware --cpu-only
 whichllm hardware --gpu "Apple M3 Max"
 whichllm hardware --gpu "RTX 3060" --vram 12
+whichllm hardware --gpu "4x RTX 4090"
 ```
 
 ## `plan`

diff --git a/docs/hardware.md b/docs/hardware.md
@@ -134,6 +134,19 @@ whichllm hardware --gpu "Unknown GPU" --vram 24
 
 `--vram` requires `--gpu`.
 
+Multi-GPU simulation accepts repeated flags, comma-separated values, and count
+shorthand:
+
+```bash
+whichllm --gpu "2x RTX 4090"
+whichllm --gpu "RTX 4090" --gpu "RTX 3090"
+whichllm --gpu "RTX 4090, RTX 3090"
+```
+
+`--vram` is only supported for a single simulated GPU. For multi-GPU
+simulation, use known GPU names so whichllm can resolve each card's VRAM from
+the GPU database.
+
 ## Fit types
 
 Compatibility checks classify a candidate into one of three fit types:
@@ -147,19 +160,26 @@ Compatibility checks classify a candidate into one of three fit types:
 If neither GPU memory nor usable RAM can hold the model, the candidate is not
 ranked.
 
-whichllm reserves about 20% of system RAM for the OS and other processes.
+whichllm keeps a bounded system-RAM reserve for the OS and other processes.
 
 ## Multiple GPUs
 
-For fit checks, whichllm sums available GPU memory. For speed estimates, it uses
-the largest detected GPU as the representative device.
+For fit checks, whichllm uses a conservative multi-GPU budget rather than
+pretending all VRAM is one perfect device. It starts from raw total VRAM, subtracts
+about 0.3 GiB of overhead per GPU, and then applies a utilization factor (0.95 for
+homogeneous sets, 0.90 for heterogeneous sets), so homogeneous sets receive a less severe reduction.
 
 If a dedicated GPU is present, low-aperture shared-memory integrated GPUs are
 not added to the fit pool. This avoids treating unrelated system RAM and
 dedicated VRAM as one full-GPU target.
 
-This is a practical approximation. It does not model every tensor-parallel or
-pipeline-parallel runtime configuration.
+For speed estimates, whichllm uses the largest detected GPU as the representative device,
+scales that estimate by a 0.70 multi-GPU factor, and marks multi-GPU speed as
+low-confidence. This avoids claiming ideal scaling when real performance depends on backend
+split mode, PCIe/NVLink bandwidth, NCCL/RCCL support, batch size, and model architecture.
+
+This is a practical fit approximation. It does not model every tensor-parallel
+or pipeline-parallel runtime configuration.
 
 ## Disk checks
 

diff --git a/src/whichllm/cli.py b/src/whichllm/cli.py
@@ -53,7 +53,7 @@ def _print_version(value: bool) -> None:
 
 def _validate_gpu_flags(
     cpu_only: bool,
-    gpu: str | None,
+    gpu: list[str] | None,
     vram: float | None,
 ) -> None:
     """Validate mutual exclusivity of GPU-related flags."""
@@ -99,17 +99,17 @@ def _resolve_evidence_mode(evidence: str, direct: bool) -> str:
 def _apply_gpu_overrides(
     hardware: HardwareInfo,
     cpu_only: bool,
-    gpu: str | None,
+    gpu: list[str] | None,
     vram: float | None,
 ) -> HardwareInfo:
     """Replace hardware.gpus based on CLI flags."""
     if cpu_only:
         hardware.gpus = []
     elif gpu:
-        from whichllm.hardware.gpu_simulator import create_synthetic_gpu
+        from whichllm.hardware.gpu_simulator import create_synthetic_gpus
 
         try:
-            hardware.gpus = [create_synthetic_gpu(gpu, vram)]
+            hardware.gpus = create_synthetic_gpus(gpu, vram)
         except ValueError as e:
             console.print(f"[red]Error:[/] {e}")
             raise typer.Exit(code=1)
@@ -251,8 +251,10 @@ def main(
     cpu_only: bool = typer.Option(
         False, "--cpu-only", help="Ignore GPU and run in CPU-only mode"
     ),
-    gpu: Optional[str] = typer.Option(
-        None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')"
+    gpu: Optional[list[str]] = typer.Option(
+        None,
+        "--gpu",
+        help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu",
     ),
     vram: Optional[float] = typer.Option(
         None, "--vram", help="Override VRAM in GB (requires --gpu)"
@@ -1101,8 +1103,10 @@ def hardware(
     cpu_only: bool = typer.Option(
         False, "--cpu-only", help="Ignore GPU and run in CPU-only mode"
     ),
-    gpu: Optional[str] = typer.Option(
-        None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')"
+    gpu: Optional[list[str]] = typer.Option(
+        None,
+        "--gpu",
+        help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu",
     ),
     vram: Optional[float] = typer.Option(
         None, "--vram", help="Override VRAM in GB (requires --gpu)"

diff --git a/src/whichllm/engine/compatibility.py b/src/whichllm/engine/compatibility.py
@@ -12,6 +12,10 @@
 from whichllm.hardware.types import GPUInfo, HardwareInfo
 from whichllm.models.types import GGUFVariant, ModelInfo
 
+_MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES = int(0.3 * _GiB)  # subtracted once per GPU from the pooled fit budget
+_MULTI_GPU_HOMOGENEOUS_UTILIZATION = 0.95  # budget multiplier when every GPU matches the first one
+_MULTI_GPU_HETEROGENEOUS_UTILIZATION = 0.90  # stricter budget multiplier for mixed GPU sets
+
 
 def _gpu_available_memory(gpu: GPUInfo, usable_ram: int) -> int:
     if gpu.shared_memory and gpu.vram_bytes < 2 * _GiB:
@@ -46,6 +50,64 @@ def _fit_candidate_gpus(gpus: list[GPUInfo]) -> list[GPUInfo]:
     return [gpu for gpu in gpus if not _uses_shared_system_pool(gpu)]
 
 
+def _gpu_identity(gpu: GPUInfo) -> str:  # normalized GPU name used to decide whether two GPUs are the same model
+    name = gpu.name.lower().replace("(simulated)", "")  # case-insensitive; drop any "(simulated)" marker
+    return " ".join(name.split())  # collapse whitespace runs and trim both ends
+
+
+def _is_homogeneous_gpu_set(gpus: list[GPUInfo], available: list[int]) -> bool:  # True when every GPU matches the first
+    if not gpus:  # an empty set counts as homogeneous
+        return True
+    first = gpus[0]
+    first_identity = _gpu_identity(first)
+    first_available = available[0]  # available[i] is the memory figure paired with gpus[i]
+    vram_tolerance = max(256 * 1024**2, int(first_available * 0.02))  # larger of 256 MiB and 2% of the first GPU's figure
+    return all(
+        gpu.vendor == first.vendor
+        and _gpu_identity(gpu) == first_identity
+        and abs(gpu_available - first_available) <= vram_tolerance
+        for gpu, gpu_available in zip(gpus, available, strict=True)  # strict=True: a length mismatch raises ValueError once the shorter list is exhausted
+    )
+
+
+def _multi_gpu_effective_vram(  # compute the memory budget used for fit checks across one or more GPUs
+    gpus: list[GPUInfo],
+    available: list[int],  # per-GPU available memory, parallel to gpus
+    warnings: list[str],  # mutated in place: explanatory messages are appended
+) -> tuple[int, bool, int | None]:  # (fit budget, whether the pooled multi-GPU budget was used, that pooled budget or None)
+    raw_total = sum(available)
+    if len(gpus) <= 1:  # zero or one GPU: no multi-GPU adjustment
+        return raw_total, False, None
+
+    if any(gpu.shared_memory or gpu.vendor == "apple" for gpu in gpus):
+        effective = max(available)  # shared-memory pools are not summed; take the largest single pool
+        warnings.append(
+            "Multiple shared-memory GPUs are not pooled; using the largest "
+            "reported memory pool for fit checks"
+        )
+        return effective, False, None
+
+    homogeneous = _is_homogeneous_gpu_set(gpus, available)
+    utilization = (
+        _MULTI_GPU_HOMOGENEOUS_UTILIZATION
+        if homogeneous
+        else _MULTI_GPU_HETEROGENEOUS_UTILIZATION
+    )
+    overhead = min(raw_total, len(gpus) * _MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES)  # capped at raw_total so the budget cannot go negative
+    effective = int((raw_total - overhead) * utilization)
+
+    warnings.append(
+        "Multi-GPU fit uses a conservative layer-split budget: "
+        f"{effective / _GiB:.1f} GB effective from {raw_total / _GiB:.1f} GB raw VRAM"
+    )
+    if not homogeneous:
+        warnings.append(
+            "Heterogeneous multi-GPU setup: fit assumes uneven layer placement; "
+            "speed depends on backend split mode and interconnect"
+        )
+    return effective, True, effective  # the budget is reported twice: as the fit value and as the multi-GPU figure
+
+
 def check_compatibility(
     model: ModelInfo,
     variant: GGUFVariant | None,
@@ -62,16 +124,25 @@ def check_compatibility(
     # Determine best GPU
     best_gpu: GPUInfo | None = None
     best_gpu_available = 0
-    total_vram = 0
+    gpu_available_values: list[int] = []
     candidate_gpus = _fit_candidate_gpus(hardware.gpus)
     for gpu in candidate_gpus:
         gpu_available = _gpu_available_memory(gpu, usable_ram)
-        total_vram += gpu_available
+        gpu_available_values.append(gpu_available)
         if best_gpu is None or gpu_available > best_gpu_available:
             best_gpu = gpu
             best_gpu_available = gpu_available
 
-    vram_available = total_vram if total_vram > 0 else 0
+    vram_available = sum(gpu_available_values) if gpu_available_values else 0
+    fit_vram_available, uses_multi_gpu, multi_gpu_effective_vram = (
+        _multi_gpu_effective_vram(candidate_gpus, gpu_available_values, warnings)
+    )
+    if (
+        len(candidate_gpus) > 1
+        and not uses_multi_gpu
+        and any(gpu.shared_memory or gpu.vendor == "apple" for gpu in candidate_gpus)
+    ):
+        vram_available = fit_vram_available
     offload_ram_available = (
         0
         if best_gpu and (best_gpu.shared_memory or best_gpu.vendor == "apple")
@@ -108,17 +179,18 @@ def check_compatibility(
         warnings.append("Metal requires macOS for Apple Silicon inference")
 
     # Determine fit type
-    if vram_available >= vram_required:
+    if fit_vram_available >= vram_required:
         fit_type = "full_gpu"
         can_run = True
         offload_ratio = 0.0
     elif (
-        vram_available > 0 and (vram_available + offload_ram_available) >= vram_required
+        fit_vram_available > 0
+        and (fit_vram_available + offload_ram_available) >= vram_required
     ):
         fit_type = "partial_offload"
         can_run = True
         offload_ratio = (
-            (vram_required - vram_available) / vram_required
+            (vram_required - fit_vram_available) / vram_required
             if vram_required > 0
             else 0.0
         )
@@ -171,6 +243,8 @@ def check_compatibility(
         vram_required_bytes=vram_required,
         vram_available_bytes=vram_available,
         offload_ratio=offload_ratio,
+        uses_multi_gpu=uses_multi_gpu,
+        multi_gpu_effective_vram_bytes=multi_gpu_effective_vram,
         warnings=warnings,
         fit_type=fit_type,
         context_fits=context_fits,

diff --git a/src/whichllm/engine/ranker.py b/src/whichllm/engine/ranker.py
@@ -33,6 +33,7 @@
 _LINEAGE_FAMILY_MAX: dict[str, int] = {
     family: max(idx for _, idx in entries) for family, entries in _LINEAGE_REGEX.items()
 }
+_MULTI_GPU_SPEED_FACTOR = 0.70  # multiplier applied to the estimated tok/s when compat.uses_multi_gpu is set
 
 
 def _family_selection_key(
@@ -755,6 +756,8 @@ def rank_models(
             tok_per_sec = estimate_tok_per_sec(
                 model, variant, best_gpu, compat.fit_type
             )
+            if compat.uses_multi_gpu:
+                tok_per_sec *= _MULTI_GPU_SPEED_FACTOR
             if min_speed is not None and tok_per_sec < min_speed:
                 continue
 
@@ -781,6 +784,18 @@ def rank_models(
                 compat.fit_type,
                 tok_per_sec,
             )
+            if compat.uses_multi_gpu:
+                compat.speed_confidence = "low"
+                if tok_per_sec > 0:
+                    compat.speed_range_tok_per_sec = (
+                        round(tok_per_sec * 0.35, 1),
+                        round(tok_per_sec * 2.0, 1),
+                    )
+                compat.speed_notes.append(
+                    "Multi-GPU speed depends on layer/tensor split mode, "
+                    "PCIe/NVLink bandwidth, and backend support; this estimate "
+                    "does not assume ideal scaling."
+                )
             compat.quality_score = _compute_quality_score(
                 model,
                 variant,

diff --git a/src/whichllm/engine/types.py b/src/whichllm/engine/types.py
@@ -24,3 +24,5 @@ class CompatibilityResult:
     benchmark_source: str = "none"  # granular: "direct" | "variant" | "base_model" | "line_interp" | "self_reported" | "none"
     benchmark_confidence: float = 0.0  # 0.0-1.0 from BenchmarkEvidence
     context_fits: bool = True  # False when known model max context < requested
+    uses_multi_gpu: bool = False  # True when the fit check used the pooled multi-GPU budget
+    multi_gpu_effective_vram_bytes: int | None = None  # pooled multi-GPU fit budget; None when uses_multi_gpu is False
diff --git a/src/whichllm/hardware/gpu_simulator.py b/src/whichllm/hardware/gpu_simulator.py
@@ -8,6 +8,7 @@
 
 import logging
 import re
+from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -190,6 +191,54 @@ def _lookup_dbgpu(name: str) -> GPUSpecification | None:
 _last_suggestions: list[tuple[str, int]] = []
 
 
+def parse_synthetic_gpu_specs(values: Sequence[str] | str) -> list[str]:  # raises ValueError on empty entries, a zero count, or no GPUs at all
+    """Expand CLI GPU simulation values into individual GPU names.
+
+    Accepts repeated options, comma-separated names, and count shorthand such
+    as ``2x RTX 4090``. The returned names are still looked up by
+    ``create_synthetic_gpu`` so existing fuzzy matching and aliases stay in
+    one place.
+    """
+    raw_values = [values] if isinstance(values, str) else list(values)  # a bare string is treated as one value
+    gpu_names: list[str] = []
+
+    for raw in raw_values:
+        for part in raw.split(","):  # each value may hold several comma-separated entries
+            spec = part.strip()
+            if not spec:
+                raise ValueError("Empty GPU entry in --gpu.")  # e.g. "A,,B" or a trailing comma
+
+            count_match = re.match(r"^(\d+)\s*x\s+(.+)$", spec, re.IGNORECASE)  # "<count>x <name>": digits, optional spaces, x/X, at least one space, then the name
+            if count_match:
+                count = int(count_match.group(1))
+                name = count_match.group(2).strip()
+                if count < 1:  # rejects "0x <name>"
+                    raise ValueError("GPU count must be at least 1.")
+                if not name:  # defensive: the pattern above already requires at least one character after the count
+                    raise ValueError("GPU count shorthand requires a GPU name.")
+                gpu_names.extend([name] * count)  # one entry per requested GPU
+            else:
+                gpu_names.append(spec)  # no count prefix: the whole entry is the GPU name
+
+    if not gpu_names:  # reached only when values is an empty sequence
+        raise ValueError("At least one GPU must be specified.")
+    return gpu_names
+
+
+def create_synthetic_gpus(  # raises ValueError for unparsable specs or a VRAM override with more than one GPU
+    values: Sequence[str] | str,
+    vram_override_gb: float | None = None,
+) -> list[GPUInfo]:
+    """Create one or more synthetic GPUs from CLI-style values."""
+    names = parse_synthetic_gpu_specs(values)  # expands comma lists and "Nx name" shorthand into one name per GPU
+    if vram_override_gb is not None and len(names) != 1:  # the override is accepted only when exactly one GPU was requested
+        raise ValueError(
+            "--vram currently supports exactly one simulated GPU. "
+            "For multi-GPU simulation, specify known GPU names and omit --vram."
+        )
+    return [create_synthetic_gpu(name, vram_override_gb) for name in names]  # per-name lookup is delegated to create_synthetic_gpu
+
+
 def create_synthetic_gpu(name: str, vram_override_gb: float | None = None) -> GPUInfo:
     """Create a synthetic GPUInfo from a GPU name.
 

diff --git a/src/whichllm/output/json_output.py b/src/whichllm/output/json_output.py
@@ -45,6 +45,9 @@ def display_json(results: list[CompatibilityResult], hardware: HardwareInfo) ->
                     else estimate_weight_bytes(r.model, None)
                 ),
                 "vram_required_bytes": r.vram_required_bytes,
+                "vram_available_bytes": r.vram_available_bytes,
+                "uses_multi_gpu": r.uses_multi_gpu,
+                "multi_gpu_effective_vram_bytes": r.multi_gpu_effective_vram_bytes,
                 "estimated_tok_per_sec": r.estimated_tok_per_sec,
                 "speed_confidence": r.speed_confidence,
                 "speed_range_tok_per_sec": (