Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Common options:
| `--json` | Print machine-readable JSON |
| `--refresh` | Ignore caches and fetch models/benchmarks again |
| `--cpu-only` | Ignore GPUs and rank for CPU-only use |
| `--gpu` | Simulate a GPU by name |
| `--gpu` | Simulate GPU(s) by name. Accepts repeated flags, comma-separated values, and count shorthand such as `2x RTX 4090` |
| `--vram` | Override simulated GPU VRAM in GB. Requires `--gpu` |
| `--version` | Print the installed package version |

Expand All @@ -38,6 +38,9 @@ Examples:
whichllm
whichllm --gpu "RTX 4090"
whichllm --gpu "RTX 5060 Ti" --vram 16
whichllm --gpu "2x RTX 4090"
whichllm --gpu "RTX 4090" --gpu "RTX 3090"
whichllm --gpu "RTX 4090, RTX 3090"
whichllm --profile coding --top 5
whichllm --context-length 64k
whichllm --evidence strict
Expand Down Expand Up @@ -70,6 +73,7 @@ whichllm hardware
whichllm hardware --cpu-only
whichllm hardware --gpu "Apple M3 Max"
whichllm hardware --gpu "RTX 3060" --vram 12
whichllm hardware --gpu "4x RTX 4090"
```

## `plan`
Expand Down
30 changes: 25 additions & 5 deletions docs/hardware.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,19 @@ whichllm hardware --gpu "Unknown GPU" --vram 24

`--vram` requires `--gpu`.

Multi-GPU simulation accepts repeated flags, comma-separated values, and count
shorthand:

```bash
whichllm --gpu "2x RTX 4090"
whichllm --gpu "RTX 4090" --gpu "RTX 3090"
whichllm --gpu "RTX 4090, RTX 3090"
```

`--vram` is only supported for a single simulated GPU. For multi-GPU
simulation, use known GPU names so whichllm can resolve each card's VRAM from
the GPU database.

## Fit types

Compatibility checks classify a candidate into one of three fit types:
Expand All @@ -147,19 +160,26 @@ Compatibility checks classify a candidate into one of three fit types:
If neither GPU memory nor usable RAM can hold the model, the candidate is not
ranked.

whichllm reserves about 20% of system RAM for the OS and other processes.
whichllm keeps a bounded system-RAM reserve for the OS and other processes.

## Multiple GPUs

For fit checks, whichllm sums available GPU memory. For speed estimates, it uses
the largest detected GPU as the representative device.
For fit checks, whichllm uses a conservative multi-GPU budget rather than
pretending all VRAM is one perfect device. It starts from raw total VRAM,
subtracts a fixed per-GPU framework overhead (about 0.3 GiB per card), and then
multiplies the remainder by a utilization factor: 0.95 for homogeneous sets and
0.90 for heterogeneous sets.

If a dedicated GPU is present, low-aperture shared-memory integrated GPUs are
not added to the fit pool. This avoids treating unrelated system RAM and
dedicated VRAM as one full-GPU target.

This is a practical approximation. It does not model every tensor-parallel or
pipeline-parallel runtime configuration.
For speed estimates, whichllm uses the largest detected GPU as the
representative device, scales that single-GPU estimate by a conservative factor
(0.70), and marks multi-GPU speed as low-confidence. This avoids claiming ideal
scaling when real performance depends on backend split mode, PCIe/NVLink
bandwidth, NCCL/RCCL support, batch size, and model architecture.

This is a practical fit approximation. It does not model every tensor-parallel
or pipeline-parallel runtime configuration.

## Disk checks

Expand Down
20 changes: 12 additions & 8 deletions src/whichllm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def _print_version(value: bool) -> None:

def _validate_gpu_flags(
cpu_only: bool,
gpu: str | None,
gpu: list[str] | None,
vram: float | None,
) -> None:
"""Validate mutual exclusivity of GPU-related flags."""
Expand Down Expand Up @@ -99,17 +99,17 @@ def _resolve_evidence_mode(evidence: str, direct: bool) -> str:
def _apply_gpu_overrides(
hardware: HardwareInfo,
cpu_only: bool,
gpu: str | None,
gpu: list[str] | None,
vram: float | None,
) -> HardwareInfo:
"""Replace hardware.gpus based on CLI flags."""
if cpu_only:
hardware.gpus = []
elif gpu:
from whichllm.hardware.gpu_simulator import create_synthetic_gpu
from whichllm.hardware.gpu_simulator import create_synthetic_gpus

try:
hardware.gpus = [create_synthetic_gpu(gpu, vram)]
hardware.gpus = create_synthetic_gpus(gpu, vram)
except ValueError as e:
console.print(f"[red]Error:[/] {e}")
raise typer.Exit(code=1)
Expand Down Expand Up @@ -251,8 +251,10 @@ def main(
cpu_only: bool = typer.Option(
False, "--cpu-only", help="Ignore GPU and run in CPU-only mode"
),
gpu: Optional[str] = typer.Option(
None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')"
gpu: Optional[list[str]] = typer.Option(
None,
"--gpu",
help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu",
),
vram: Optional[float] = typer.Option(
None, "--vram", help="Override VRAM in GB (requires --gpu)"
Expand Down Expand Up @@ -1101,8 +1103,10 @@ def hardware(
cpu_only: bool = typer.Option(
False, "--cpu-only", help="Ignore GPU and run in CPU-only mode"
),
gpu: Optional[str] = typer.Option(
None, "--gpu", help="Simulate a GPU (e.g. 'RTX 4090')"
gpu: Optional[list[str]] = typer.Option(
None,
"--gpu",
help="Simulate GPU(s), e.g. 'RTX 4090', '2x RTX 4090', or repeat --gpu",
),
vram: Optional[float] = typer.Option(
None, "--vram", help="Override VRAM in GB (requires --gpu)"
Expand Down
86 changes: 80 additions & 6 deletions src/whichllm/engine/compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from whichllm.hardware.types import GPUInfo, HardwareInfo
from whichllm.models.types import GGUFVariant, ModelInfo

_MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES = int(0.3 * _GiB)
_MULTI_GPU_HOMOGENEOUS_UTILIZATION = 0.95
_MULTI_GPU_HETEROGENEOUS_UTILIZATION = 0.90


def _gpu_available_memory(gpu: GPUInfo, usable_ram: int) -> int:
if gpu.shared_memory and gpu.vram_bytes < 2 * _GiB:
Expand Down Expand Up @@ -46,6 +50,64 @@ def _fit_candidate_gpus(gpus: list[GPUInfo]) -> list[GPUInfo]:
return [gpu for gpu in gpus if not _uses_shared_system_pool(gpu)]


def _gpu_identity(gpu: GPUInfo) -> str:
name = gpu.name.lower().replace("(simulated)", "")
return " ".join(name.split())


def _is_homogeneous_gpu_set(gpus: list[GPUInfo], available: list[int]) -> bool:
if not gpus:
return True
first = gpus[0]
first_identity = _gpu_identity(first)
first_available = available[0]
vram_tolerance = max(256 * 1024**2, int(first_available * 0.02))
return all(
gpu.vendor == first.vendor
and _gpu_identity(gpu) == first_identity
and abs(gpu_available - first_available) <= vram_tolerance
for gpu, gpu_available in zip(gpus, available, strict=True)
)


def _multi_gpu_effective_vram(
gpus: list[GPUInfo],
available: list[int],
warnings: list[str],
) -> tuple[int, bool, int | None]:
raw_total = sum(available)
if len(gpus) <= 1:
return raw_total, False, None

if any(gpu.shared_memory or gpu.vendor == "apple" for gpu in gpus):
effective = max(available)
warnings.append(
"Multiple shared-memory GPUs are not pooled; using the largest "
"reported memory pool for fit checks"
)
return effective, False, None

homogeneous = _is_homogeneous_gpu_set(gpus, available)
utilization = (
_MULTI_GPU_HOMOGENEOUS_UTILIZATION
if homogeneous
else _MULTI_GPU_HETEROGENEOUS_UTILIZATION
)
overhead = min(raw_total, len(gpus) * _MULTI_GPU_FRAMEWORK_OVERHEAD_BYTES)
effective = int((raw_total - overhead) * utilization)

warnings.append(
"Multi-GPU fit uses a conservative layer-split budget: "
f"{effective / _GiB:.1f} GB effective from {raw_total / _GiB:.1f} GB raw VRAM"
)
if not homogeneous:
warnings.append(
"Heterogeneous multi-GPU setup: fit assumes uneven layer placement; "
"speed depends on backend split mode and interconnect"
)
return effective, True, effective


def check_compatibility(
model: ModelInfo,
variant: GGUFVariant | None,
Expand All @@ -62,16 +124,25 @@ def check_compatibility(
# Determine best GPU
best_gpu: GPUInfo | None = None
best_gpu_available = 0
total_vram = 0
gpu_available_values: list[int] = []
candidate_gpus = _fit_candidate_gpus(hardware.gpus)
for gpu in candidate_gpus:
gpu_available = _gpu_available_memory(gpu, usable_ram)
total_vram += gpu_available
gpu_available_values.append(gpu_available)
if best_gpu is None or gpu_available > best_gpu_available:
best_gpu = gpu
best_gpu_available = gpu_available

vram_available = total_vram if total_vram > 0 else 0
vram_available = sum(gpu_available_values) if gpu_available_values else 0
fit_vram_available, uses_multi_gpu, multi_gpu_effective_vram = (
_multi_gpu_effective_vram(candidate_gpus, gpu_available_values, warnings)
)
if (
len(candidate_gpus) > 1
and not uses_multi_gpu
and any(gpu.shared_memory or gpu.vendor == "apple" for gpu in candidate_gpus)
):
vram_available = fit_vram_available
offload_ram_available = (
0
if best_gpu and (best_gpu.shared_memory or best_gpu.vendor == "apple")
Expand Down Expand Up @@ -108,17 +179,18 @@ def check_compatibility(
warnings.append("Metal requires macOS for Apple Silicon inference")

# Determine fit type
if vram_available >= vram_required:
if fit_vram_available >= vram_required:
fit_type = "full_gpu"
can_run = True
offload_ratio = 0.0
elif (
vram_available > 0 and (vram_available + offload_ram_available) >= vram_required
fit_vram_available > 0
and (fit_vram_available + offload_ram_available) >= vram_required
):
fit_type = "partial_offload"
can_run = True
offload_ratio = (
(vram_required - vram_available) / vram_required
(vram_required - fit_vram_available) / vram_required
if vram_required > 0
else 0.0
)
Expand Down Expand Up @@ -171,6 +243,8 @@ def check_compatibility(
vram_required_bytes=vram_required,
vram_available_bytes=vram_available,
offload_ratio=offload_ratio,
uses_multi_gpu=uses_multi_gpu,
multi_gpu_effective_vram_bytes=multi_gpu_effective_vram,
warnings=warnings,
fit_type=fit_type,
context_fits=context_fits,
Expand Down
15 changes: 15 additions & 0 deletions src/whichllm/engine/ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
_LINEAGE_FAMILY_MAX: dict[str, int] = {
family: max(idx for _, idx in entries) for family, entries in _LINEAGE_REGEX.items()
}
_MULTI_GPU_SPEED_FACTOR = 0.70


def _family_selection_key(
Expand Down Expand Up @@ -755,6 +756,8 @@ def rank_models(
tok_per_sec = estimate_tok_per_sec(
model, variant, best_gpu, compat.fit_type
)
if compat.uses_multi_gpu:
tok_per_sec *= _MULTI_GPU_SPEED_FACTOR
if min_speed is not None and tok_per_sec < min_speed:
continue

Expand All @@ -781,6 +784,18 @@ def rank_models(
compat.fit_type,
tok_per_sec,
)
if compat.uses_multi_gpu:
compat.speed_confidence = "low"
if tok_per_sec > 0:
compat.speed_range_tok_per_sec = (
round(tok_per_sec * 0.35, 1),
round(tok_per_sec * 2.0, 1),
)
compat.speed_notes.append(
"Multi-GPU speed depends on layer/tensor split mode, "
"PCIe/NVLink bandwidth, and backend support; this estimate "
"does not assume ideal scaling."
)
compat.quality_score = _compute_quality_score(
model,
variant,
Expand Down
2 changes: 2 additions & 0 deletions src/whichllm/engine/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ class CompatibilityResult:
benchmark_source: str = "none" # granular: "direct" | "variant" | "base_model" | "line_interp" | "self_reported" | "none"
benchmark_confidence: float = 0.0 # 0.0-1.0 from BenchmarkEvidence
context_fits: bool = True # False when known model max context < requested
uses_multi_gpu: bool = False
multi_gpu_effective_vram_bytes: int | None = None
49 changes: 49 additions & 0 deletions src/whichllm/hardware/gpu_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import logging
import re
from collections.abc import Sequence
from typing import TYPE_CHECKING

if TYPE_CHECKING:
Expand Down Expand Up @@ -190,6 +191,54 @@ def _lookup_dbgpu(name: str) -> GPUSpecification | None:
_last_suggestions: list[tuple[str, int]] = []


# Count shorthand such as "2x RTX 4090" or "4 X A100"; compiled once at import.
_GPU_COUNT_SHORTHAND_RE = re.compile(r"^(\d+)\s*x\s+(.+)$", re.IGNORECASE)

# Upper bound on the expanded GPU list so a mistyped count (e.g. "99999999x")
# cannot allocate an enormous list or trigger that many GPU lookups.
# NOTE(review): chosen to sit far above realistic setups -- adjust if needed.
_MAX_SIMULATED_GPUS = 1024


def parse_synthetic_gpu_specs(values: Sequence[str] | str) -> list[str]:
    """Expand CLI GPU simulation values into individual GPU names.

    Accepts repeated options, comma-separated names, and count shorthand such
    as ``2x RTX 4090``. The returned names are still looked up by
    ``create_synthetic_gpu`` so existing fuzzy matching and aliases stay in
    one place.

    Args:
        values: A single ``--gpu`` value or a sequence of them.

    Returns:
        One GPU name per simulated device, in the order given, with count
        shorthand expanded into repeated names.

    Raises:
        ValueError: If an entry is empty (for example a trailing comma), a
            count is below 1, the expansion would exceed
            ``_MAX_SIMULATED_GPUS`` devices, or no GPU is specified at all.
    """
    raw_values = [values] if isinstance(values, str) else list(values)
    gpu_names: list[str] = []

    for raw in raw_values:
        for part in raw.split(","):
            spec = part.strip()
            if not spec:
                raise ValueError("Empty GPU entry in --gpu.")

            count_match = _GPU_COUNT_SHORTHAND_RE.match(spec)
            if count_match:
                count = int(count_match.group(1))
                if count < 1:
                    raise ValueError("GPU count must be at least 1.")
                # `spec` is already stripped and the pattern requires at least
                # one character after the separator, so the name is non-empty.
                name = count_match.group(2).strip()
            else:
                count, name = 1, spec

            # Check before extending so an absurd count never allocates.
            if len(gpu_names) + count > _MAX_SIMULATED_GPUS:
                raise ValueError(
                    f"Too many simulated GPUs (limit {_MAX_SIMULATED_GPUS})."
                )
            gpu_names.extend([name] * count)

    if not gpu_names:
        raise ValueError("At least one GPU must be specified.")
    return gpu_names


def create_synthetic_gpus(
    values: Sequence[str] | str,
    vram_override_gb: float | None = None,
) -> list[GPUInfo]:
    """Build a synthetic GPU for every entry expanded from ``values``.

    ``values`` is first expanded with ``parse_synthetic_gpu_specs``; each
    resulting name is then passed to ``create_synthetic_gpu`` together with
    ``vram_override_gb``.

    Raises:
        ValueError: If ``vram_override_gb`` is given while ``values`` expands
            to anything other than exactly one GPU. Errors raised while
            parsing ``values`` propagate unchanged.
    """
    expanded = parse_synthetic_gpu_specs(values)

    override_requested = vram_override_gb is not None
    if override_requested and len(expanded) != 1:
        raise ValueError(
            "--vram currently supports exactly one simulated GPU. "
            "For multi-GPU simulation, specify known GPU names and omit --vram."
        )

    simulated = []
    for gpu_name in expanded:
        simulated.append(create_synthetic_gpu(gpu_name, vram_override_gb))
    return simulated


def create_synthetic_gpu(name: str, vram_override_gb: float | None = None) -> GPUInfo:
"""Create a synthetic GPUInfo from a GPU name.

Expand Down
3 changes: 3 additions & 0 deletions src/whichllm/output/json_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def display_json(results: list[CompatibilityResult], hardware: HardwareInfo) ->
else estimate_weight_bytes(r.model, None)
),
"vram_required_bytes": r.vram_required_bytes,
"vram_available_bytes": r.vram_available_bytes,
"uses_multi_gpu": r.uses_multi_gpu,
"multi_gpu_effective_vram_bytes": r.multi_gpu_effective_vram_bytes,
"estimated_tok_per_sec": r.estimated_tok_per_sec,
"speed_confidence": r.speed_confidence,
"speed_range_tok_per_sec": (
Expand Down
Loading