From 018c33129e90bc2b3b8c2e56a5f282be10116195 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Wed, 11 Feb 2026 17:06:36 -0500 Subject: [PATCH 1/5] refactored to use dependency injection for interfacing with hardware --- dependency-injection-refactoring.md | 152 +++++++++++++++ rapids_cli/debug/debug.py | 46 +++-- rapids_cli/doctor/checks/cuda_driver.py | 23 ++- rapids_cli/doctor/checks/gpu.py | 42 +++-- rapids_cli/doctor/checks/memory.py | 65 +++++-- rapids_cli/doctor/checks/nvlink.py | 59 +++--- rapids_cli/doctor/doctor.py | 5 +- rapids_cli/hardware.py | 229 +++++++++++++++++++++++ rapids_cli/tests/test_cuda.py | 31 +--- rapids_cli/tests/test_debug.py | 63 +++---- rapids_cli/tests/test_gpu.py | 83 ++++----- rapids_cli/tests/test_hardware.py | 235 ++++++++++++++++++++++++ rapids_cli/tests/test_memory.py | 96 +++++----- rapids_cli/tests/test_nvlink.py | 162 +++++----------- 14 files changed, 936 insertions(+), 355 deletions(-) create mode 100644 dependency-injection-refactoring.md create mode 100644 rapids_cli/hardware.py create mode 100644 rapids_cli/tests/test_hardware.py diff --git a/dependency-injection-refactoring.md b/dependency-injection-refactoring.md new file mode 100644 index 0000000..697642c --- /dev/null +++ b/dependency-injection-refactoring.md @@ -0,0 +1,152 @@ +# Dependency Injection Refactoring + +## Context + +The check modules (`gpu.py`, `cuda_driver.py`, `memory.py`, `nvlink.py`) +and `debug.py` previously called `pynvml`, `psutil`, and `cuda.pathfinder` +directly. This forced tests to use 50+ `mock.patch` calls with deeply +nested context managers and `MagicMock` objects to simulate hardware +configurations. A thin abstraction layer was introduced so tests can +construct plain dataclasses instead of mocking low-level library internals. + +## Approach: Default Parameter Injection with Provider Dataclasses + +A single new file `rapids_cli/hardware.py` was created containing: + +- **`DeviceInfo`** dataclass -- holds per-GPU data + (index, compute capability, memory, nvlink states) +- **`GpuInfoProvider`** protocol -- read-only interface for GPU info + (`device_count`, `devices`, `cuda_driver_version`, `driver_version`) +- **`SystemInfoProvider`** protocol -- read-only interface for system info + (`total_memory_bytes`, `cuda_runtime_path`) +- **`NvmlGpuInfo`** -- real implementation backed by pynvml + (lazy-loads on first property access, caches results) +- **`DefaultSystemInfo`** -- real implementation backed by + psutil + cuda.pathfinder (lazy-loads per property) +- **`FakeGpuInfo`** / **`FakeSystemInfo`** -- test fakes + (plain dataclasses, no hardware dependency) +- **`FailingGpuInfo`** / **`FailingSystemInfo`** -- test fakes that + raise `ValueError` on access (simulates missing hardware) + +Check functions gained an optional keyword parameter with `None` default: + +```python +def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): + if gpu_info is None: # pragma: no cover + gpu_info = NvmlGpuInfo() +``` + +The orchestrator (`doctor.py`) creates a shared `NvmlGpuInfo()` instance +and passes it to all checks via `check_fn(verbose=verbose, gpu_info=gpu_info)`. +Third-party plugins safely ignore the extra keyword argument via their +own `**kwargs`. 
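In practice a test builds a fake provider and passes it straight in, with
no `mock.patch` at all. For example, mirroring the rewritten `test_gpu.py`:

```python
from rapids_cli.doctor.checks.gpu import gpu_check
from rapids_cli.hardware import FakeGpuInfo


def test_gpu_check_success():
    gpu_info = FakeGpuInfo(device_count=2)
    assert gpu_check(verbose=True, gpu_info=gpu_info) == "GPU(s) detected: 2"
```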
+ +## Files Changed + +### New file: `rapids_cli/hardware.py` + +Contains all provider abstractions: + +- `DeviceInfo` dataclass with fields: `index`, `compute_capability`, + `memory_total_bytes`, `nvlink_states` +- `GpuInfoProvider` and `SystemInfoProvider` protocols + (runtime-checkable) +- `NvmlGpuInfo` -- calls `nvmlInit()` once on first property access, + queries all device info (count, compute capability, memory, + NVLink states), and caches everything +- `DefaultSystemInfo` -- lazily loads system memory via psutil and + CUDA path via cuda.pathfinder (each cached independently) +- `FakeGpuInfo`, `FakeSystemInfo` -- `@dataclass` test fakes with + pre-set data +- `FailingGpuInfo`, `FailingSystemInfo` -- test fakes that raise + `ValueError` on any property access + +### Modified: `rapids_cli/doctor/checks/gpu.py` + +- Removed `import pynvml` +- Added `gpu_info: GpuInfoProvider | None = None` parameter and + `**kwargs` to both `gpu_check()` and `check_gpu_compute_capability()` +- Replaced direct `pynvml` calls with `gpu_info.device_count` and + iteration over `gpu_info.devices` + +### Modified: `rapids_cli/doctor/checks/cuda_driver.py` + +- Removed `import pynvml` +- Added `gpu_info` parameter and `**kwargs` to `cuda_check()` +- Replaced nested try/except with `gpu_info.cuda_driver_version` + +### Modified: `rapids_cli/doctor/checks/memory.py` + +- Removed `import pynvml` and `import psutil` +- Added `system_info` parameter to `get_system_memory()` +- Added `gpu_info` parameter to `get_gpu_memory()` +- Added both `gpu_info` and `system_info` parameters to + `check_memory_to_gpu_ratio()` +- `get_system_memory()` reads `system_info.total_memory_bytes` +- `get_gpu_memory()` sums `dev.memory_total_bytes` from + `gpu_info.devices` +- `check_memory_to_gpu_ratio()` passes injected providers down + to helpers + +### Modified: `rapids_cli/doctor/checks/nvlink.py` + +- Removed `import pynvml` +- Added `gpu_info` parameter and `**kwargs` to `check_nvlink_status()` +- Iterates `dev.nvlink_states` instead of calling + `nvmlDeviceGetNvLinkState` +- **Side-fix**: the original code always passed `0` instead of + `nvlink_id` to `nvmlDeviceGetNvLinkState`; the refactored + `NvmlGpuInfo` queries each link by its actual index + +### Modified: `rapids_cli/debug/debug.py` + +- Removed `import pynvml` and `import cuda.pathfinder` +- Added `gpu_info` parameter to `gather_cuda_version()` +- Added `gpu_info` and `system_info` parameters to `run_debug()` +- Replaced direct pynvml/cuda.pathfinder calls with provider + property accesses + +### Modified: `rapids_cli/doctor/doctor.py` + +- Imports `NvmlGpuInfo` from `rapids_cli.hardware` +- Creates a shared `NvmlGpuInfo()` instance before the check loop +- Passes it via `check_fn(verbose=verbose, gpu_info=gpu_info)` + +### Rewritten tests + +`test_gpu.py`, `test_cuda.py`, `test_memory.py`, `test_nvlink.py`, +`test_debug.py`: + +- Replaced all `patch("pynvml.*")` / `patch("psutil.*")` / + `patch("cuda.pathfinder.*")` with `FakeGpuInfo` / `FakeSystemInfo` / + `FailingGpuInfo` construction +- Tests for `debug.py` still use patches for non-hardware concerns + (subprocess, pathlib, gather_tools) + +### New file: `rapids_cli/tests/test_hardware.py` + +- Unit tests for `NvmlGpuInfo` + (init failure, loads once, device data, NVLink states, no NVLink) +- Unit tests for `DefaultSystemInfo` + (total memory, CUDA runtime path, caching) +- Tests for `FakeGpuInfo` / `FakeSystemInfo` + (defaults, custom values, protocol satisfaction) +- Tests for `FailingGpuInfo` / 
`FailingSystemInfo` + (all properties raise) + +## Impact + +| Metric | Before | After | +| --------------------------------------------- | ------- | --------------------------------- | +| Hardware library patches in check/debug tests | ~51 | 0 (moved to test_hardware.py) | +| import pynvml in check/debug modules | 5 files | 1 file (hardware.py) | +| MagicMock objects for hardware | ~11 | 0 | +| pynvml.nvmlInit() calls in production | 7 | 1 (in NvmlGpuInfo._ensure_loaded) | +| Total tests | 53 | 72 (+19 hardware tests) | +| Coverage | 95%+ | 97.72% | + +## Verification + +1. `pytest` -- all 72 tests pass +2. `pytest --cov-fail-under=95` -- coverage at 97.72%, above threshold +3. `pre-commit run --all-files` -- all checks pass diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py index fca4d1d..b4afde5 100644 --- a/rapids_cli/debug/debug.py +++ b/rapids_cli/debug/debug.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """This module contains the debug subcommand for the Rapids CLI.""" +from __future__ import annotations + import json import platform import subprocess @@ -9,22 +11,29 @@ from datetime import datetime from importlib.metadata import distributions, version from pathlib import Path +from typing import TYPE_CHECKING -import cuda.pathfinder -import pynvml from rich.console import Console from rich.table import Table +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider + console = Console() -def gather_cuda_version(): +def gather_cuda_version(*, gpu_info: GpuInfoProvider | None = None): """Return CUDA driver version as a string, similar to nvidia-smi output.""" - version = pynvml.nvmlSystemGetCudaDriverVersion() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + + ver = gpu_info.cuda_driver_version # pynvml returns an int like 12040 for 12.4, so format as string - major = version // 1000 - minor = (version % 1000) // 10 - patch = version % 10 + major = ver // 1000 + minor = (ver % 1000) // 10 + patch = ver % 10 if patch == 0: return f"{major}.{minor}" else: @@ -67,18 +76,31 @@ def gather_tools(): } -def run_debug(output_format="console"): +def run_debug( + output_format="console", + *, + gpu_info: GpuInfoProvider | None = None, + system_info: SystemInfoProvider | None = None, +): """Run debug.""" - pynvml.nvmlInit() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + debug_info = { "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "platform": platform.platform(), "nvidia_smi_output": gather_command_output( ["nvidia-smi"], "Nvidia-smi not installed" ), - "driver_version": pynvml.nvmlSystemGetDriverVersion(), - "cuda_version": gather_cuda_version(), - "cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"), + "driver_version": gpu_info.driver_version, + "cuda_version": gather_cuda_version(gpu_info=gpu_info), + "cuda_runtime_path": system_info.cuda_runtime_path, "system_ctk": sorted( [str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()] ), diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py index 252dd47..6275c1a 100644 --- a/rapids_cli/doctor/checks/cuda_driver.py +++ b/rapids_cli/doctor/checks/cuda_driver.py @@ -2,17 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 """Check for CUDA and 
driver compatibility.""" -import pynvml +from __future__ import annotations +from typing import TYPE_CHECKING -def cuda_check(verbose=False): +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider + + +def cuda_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Check CUDA availability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - try: - cuda_version = pynvml.nvmlSystemGetCudaDriverVersion() - return cuda_version - except pynvml.NVMLError as e: - raise ValueError("Unable to look up CUDA version") from e - except pynvml.NVMLError as e: + return gpu_info.cuda_driver_version + except ValueError as e: raise ValueError("Unable to look up CUDA version") from e diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py index 77e6ca6..d8e1a45 100644 --- a/rapids_cli/doctor/checks/gpu.py +++ b/rapids_cli/doctor/checks/gpu.py @@ -2,38 +2,52 @@ # SPDX-License-Identifier: Apache-2.0 """GPU checks for the doctor command.""" -import pynvml +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider REQUIRED_COMPUTE_CAPABILITY = 7 -def gpu_check(verbose=False): +def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Check GPU availability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - num_gpus = pynvml.nvmlDeviceGetCount() - except pynvml.NVMLError as e: + num_gpus = gpu_info.device_count + except ValueError as e: raise ValueError("No available GPUs detected") from e assert num_gpus > 0, "No GPUs detected" return f"GPU(s) detected: {num_gpus}" -def check_gpu_compute_capability(verbose): +def check_gpu_compute_capability( + verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs +): """Check the system for GPU Compute Capability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + devices = gpu_info.devices + except ValueError as e: raise ValueError("No GPU - cannot determine GPU Compute Capability") from e - for i in range(pynvml.nvmlDeviceGetCount()): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) - if major >= REQUIRED_COMPUTE_CAPABILITY: + for dev in devices: + if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY: continue else: raise ValueError( - f"GPU {i} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " - f"or higher but only has {major}.{minor}." + f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " + f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}." "See https://developer.nvidia.com/cuda-gpus for more information." 
) return True diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py index cb1fcb5..f1d8231 100644 --- a/rapids_cli/doctor/checks/memory.py +++ b/rapids_cli/doctor/checks/memory.py @@ -2,46 +2,71 @@ # SPDX-License-Identifier: Apache-2.0 """Memory checks.""" +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING -import psutil -import pynvml +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider -def get_system_memory(verbose=False): +def get_system_memory( + verbose=False, *, system_info: SystemInfoProvider | None = None, **kwargs +): """Get the total system memory.""" - virtual_memory = psutil.virtual_memory() - total_memory = virtual_memory.total / (1024**3) # converts bytes to gigabytes + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + + total_memory = system_info.total_memory_bytes / ( + 1024**3 + ) # converts bytes to gigabytes return total_memory -def get_gpu_memory(verbose=False): +def get_gpu_memory(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Get the total GPU memory.""" - pynvml.nvmlInit() - gpus = pynvml.nvmlDeviceGetCount() - gpu_memory_total = 0 - for i in range(gpus): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - gpu_memory_total += memory_info.total / (1024**3) # converts to gigabytes - - pynvml.nvmlShutdown() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + + gpu_memory_total = sum(dev.memory_total_bytes for dev in gpu_info.devices) / ( + 1024**3 + ) # converts to gigabytes return gpu_memory_total -def check_memory_to_gpu_ratio(verbose=True): +def check_memory_to_gpu_ratio( + verbose=True, + *, + gpu_info: GpuInfoProvider | None = None, + system_info: SystemInfoProvider | None = None, + **kwargs, +): """Check the system for a 2:1 ratio of system Memory to total GPU Memory. This is especially useful for Dask. """ + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + _ = gpu_info.device_count + except ValueError as e: raise ValueError("GPU not found. 
Please ensure GPUs are installed.") from e - system_memory = get_system_memory(verbose) - gpu_memory = get_gpu_memory(verbose) + system_memory = get_system_memory(verbose, system_info=system_info) + gpu_memory = get_gpu_memory(verbose, gpu_info=gpu_info) ratio = system_memory / gpu_memory if ratio < 1.8: warnings.warn( diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index 6dd6c66..afe4f41 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -2,18 +2,28 @@ # SPDX-License-Identifier: Apache-2.0 """Check for NVLink status.""" -import pynvml +from __future__ import annotations +from typing import TYPE_CHECKING -def check_nvlink_status(verbose=True, **kwargs): +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider + + +def check_nvlink_status( + verbose=True, *, gpu_info: GpuInfoProvider | None = None, **kwargs +): """Check NVLink status across all GPUs.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + device_count = gpu_info.device_count + except ValueError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e - device_count = pynvml.nvmlDeviceGetCount() - # NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing # to link to, so there is nothing to check. if device_count < 2: @@ -23,29 +33,20 @@ def check_nvlink_status(verbose=True, **kwargs): # model). Mixed configurations — e.g. some NVLink-capable GPUs alongside some # that are not — are not handled and may produce misleading results. - failed_links: list[tuple[int, int]] = [] - - for gpu_idx in range(device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) - # NVML provides no API to query the number of NVLink slots on a device - # (e.g. V100=6, A100=12, H100=18). The only way to discover the real count - # is to iterate up to NVML_NVLINK_MAX_LINKS and stop when the driver signals - # that link_id is out of range via NVMLError_InvalidArgument. - for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): - try: - # nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED - # if the link is active, or NVML_FEATURE_DISABLED if it is not. - state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) - if state == pynvml.NVML_FEATURE_DISABLED: - failed_links.append((gpu_idx, link_id)) - except pynvml.NVMLError_NotSupported: - # The driver reports NVLink is not supported on this system. - # There is nothing to check — skip like the single-GPU case above. - return False - except pynvml.NVMLError_InvalidArgument: - # link_id exceeds the number of NVLink slots on this device. - # Stop iterating links for this GPU. - break + devices = gpu_info.devices + + # An empty nvlink_states means the driver reported NVLink as unsupported (or + # no links were enumerated) for that device. Treat a system where no device + # advertises links the same as the single-GPU case — nothing to check. 
+ if all(not dev.nvlink_states for dev in devices): + return False + + failed_links: list[tuple[int, int]] = [ + (dev.index, link_id) + for dev in devices + for link_id, active in enumerate(dev.nvlink_states) + if not active + ] if failed_links: details = ", ".join(f"GPU {gpu} link {link}" for gpu, link in failed_links) diff --git a/rapids_cli/doctor/doctor.py b/rapids_cli/doctor/doctor.py index e7cd0ad..9bcea69 100644 --- a/rapids_cli/doctor/doctor.py +++ b/rapids_cli/doctor/doctor.py @@ -10,6 +10,7 @@ from rapids_cli._compatibility import entry_points from rapids_cli.constants import DOCTOR_SYMBOL +from rapids_cli.hardware import NvmlGpuInfo console = Console() @@ -76,6 +77,8 @@ def doctor_check( console.print("Dry run, skipping checks") return True + gpu_info = NvmlGpuInfo() + results: list[CheckResult] = [] with console.status("[bold green]Running checks...") as ui_status: for i, check_fn in enumerate(checks): @@ -87,7 +90,7 @@ def doctor_check( with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") status = True - value = check_fn(verbose=verbose) + value = check_fn(verbose=verbose, gpu_info=gpu_info) caught_warnings = w except Exception as e: diff --git a/rapids_cli/hardware.py b/rapids_cli/hardware.py new file mode 100644 index 0000000..94aab52 --- /dev/null +++ b/rapids_cli/hardware.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Hardware abstraction layer for GPU and system information.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@dataclass +class DeviceInfo: + """Per-GPU device information.""" + + index: int + compute_capability: tuple[int, int] + memory_total_bytes: int + nvlink_states: list[bool] = field(default_factory=list) + + +@runtime_checkable +class GpuInfoProvider(Protocol): + """Read-only interface for GPU information.""" + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + ... + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + ... + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer.""" + ... + + @property + def driver_version(self) -> str: + """Return driver version string.""" + ... + + +@runtime_checkable +class SystemInfoProvider(Protocol): + """Read-only interface for system information.""" + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + ... + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + ... + + +class NvmlGpuInfo: + """Real GPU info provider backed by pynvml. + + Lazily loads all device information on first property access and caches results. 
+ """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._loaded = False + self._device_count = 0 + self._devices: list[DeviceInfo] = [] + self._cuda_driver_version = 0 + self._driver_version = "" + + def _ensure_loaded(self) -> None: + if self._loaded: + return + + import pynvml + + try: + pynvml.nvmlInit() + except pynvml.NVMLError as e: + raise ValueError("Unable to initialize GPU driver (NVML)") from e + + self._device_count = pynvml.nvmlDeviceGetCount() + self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion() + self._driver_version = pynvml.nvmlSystemGetDriverVersion() + + self._devices = [] + for i in range(self._device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + + nvlink_states: list[bool] = [] + for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): + try: + state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) + nvlink_states.append(bool(state)) + except pynvml.NVMLError: + break + + self._devices.append( + DeviceInfo( + index=i, + compute_capability=(major, minor), + memory_total_bytes=memory_info.total, + nvlink_states=nvlink_states, + ) + ) + + self._loaded = True + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + self._ensure_loaded() + return self._device_count + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + self._ensure_loaded() + return self._devices + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer (e.g. 12040).""" + self._ensure_loaded() + return self._cuda_driver_version + + @property + def driver_version(self) -> str: + """Return driver version string.""" + self._ensure_loaded() + return self._driver_version + + +class DefaultSystemInfo: + """Real system info provider backed by psutil and cuda.pathfinder. + + Lazily loads each piece of information on first access. 
+ """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._memory_loaded = False + self._total_memory_bytes = 0 + self._cuda_path_loaded = False + self._cuda_runtime_path: str | None = None + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + if not self._memory_loaded: + import psutil + + self._total_memory_bytes = psutil.virtual_memory().total + self._memory_loaded = True + return self._total_memory_bytes + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + if not self._cuda_path_loaded: + import cuda.pathfinder + + self._cuda_runtime_path = cuda.pathfinder.find_nvidia_header_directory( + "cudart" + ) + self._cuda_path_loaded = True + return self._cuda_runtime_path + + +@dataclass +class FakeGpuInfo: + """Test fake for GPU information with pre-set data.""" + + device_count: int = 0 + devices: list[DeviceInfo] = field(default_factory=list) + cuda_driver_version: int = 0 + driver_version: str = "" + + +@dataclass +class FakeSystemInfo: + """Test fake for system information with pre-set data.""" + + total_memory_bytes: int = 0 + cuda_runtime_path: str | None = None + + +class FailingGpuInfo: + """Test fake that raises ValueError on any property access.""" + + @property + def device_count(self) -> int: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def devices(self) -> list[DeviceInfo]: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def cuda_driver_version(self) -> int: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def driver_version(self) -> str: + """Raise ValueError.""" + raise ValueError("No GPU available") + + +class FailingSystemInfo: + """Test fake that raises ValueError on any property access.""" + + @property + def total_memory_bytes(self) -> int: + """Raise ValueError.""" + raise ValueError("System info unavailable") + + @property + def cuda_runtime_path(self) -> str | None: + """Raise ValueError.""" + raise ValueError("System info unavailable") diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py index c6d4525..de4fd99 100644 --- a/rapids_cli/tests/test_cuda.py +++ b/rapids_cli/tests/test_cuda.py @@ -1,34 +1,17 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - -import pynvml import pytest from rapids_cli.doctor.checks.cuda_driver import cuda_check +from rapids_cli.hardware import FailingGpuInfo, FakeGpuInfo def test_cuda_check_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - ): - assert cuda_check(verbose=True) == 12050 - - -def test_cuda_check_init_fails(): - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises(ValueError, match="Unable to look up CUDA version"): - cuda_check() + gpu_info = FakeGpuInfo(cuda_driver_version=12050) + assert cuda_check(verbose=True, gpu_info=gpu_info) == 12050 -def test_cuda_check_version_query_fails(): - with ( - patch("pynvml.nvmlInit"), - patch( - "pynvml.nvmlSystemGetCudaDriverVersion", - side_effect=pynvml.NVMLError(1), - ), - ): - with pytest.raises(ValueError, match="Unable to look up CUDA version"): - cuda_check() +def test_cuda_check_no_gpu(): + gpu_info = FailingGpuInfo() + with pytest.raises(ValueError, match="Unable to look up CUDA version"): + cuda_check(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py index 91c330c..79b9db5 100644 --- a/rapids_cli/tests/test_debug.py +++ b/rapids_cli/tests/test_debug.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import json -from unittest.mock import MagicMock, patch +from unittest.mock import patch from rapids_cli.debug.debug import ( gather_command_output, @@ -10,24 +10,22 @@ gather_tools, run_debug, ) +from rapids_cli.hardware import FakeGpuInfo, FakeSystemInfo def test_gather_cuda_version(): - """Test CUDA version gathering.""" - with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040): - result = gather_cuda_version() - assert result == "12.4" + gpu_info = FakeGpuInfo(cuda_driver_version=12040) + result = gather_cuda_version(gpu_info=gpu_info) + assert result == "12.4" def test_gather_cuda_version_with_patch(): - """Test CUDA version with patch number.""" - with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12345): - result = gather_cuda_version() - assert result == "12.34.5" + gpu_info = FakeGpuInfo(cuda_driver_version=12345) + result = gather_cuda_version(gpu_info=gpu_info) + assert result == "12.34.5" def test_gather_package_versions(): - """Test package version gathering.""" result = gather_package_versions() assert isinstance(result, dict) assert len(result) > 0 @@ -36,25 +34,21 @@ def test_gather_package_versions(): def test_gather_command_output_success(): - """Test successful command output gathering.""" result = gather_command_output(["echo", "test"]) assert result == "test" def test_gather_command_output_with_fallback(): - """Test command output with fallback.""" result = gather_command_output(["nonexistent_command"], fallback_output="fallback") assert result == "fallback" def test_gather_command_output_no_fallback(): - """Test command output without fallback.""" result = gather_command_output(["nonexistent_command"]) assert result is None def test_gather_tools(): - """Test tools gathering.""" with ( patch( "rapids_cli.debug.debug.gather_command_output", @@ -69,40 +63,41 @@ def test_gather_tools(): def test_run_debug_console(capsys): - """Test run_debug with console output.""" - mock_vm = MagicMock() - mock_vm.total = 32 * 1024**3 + gpu_info = FakeGpuInfo( + device_count=1, + 
cuda_driver_version=12040, + driver_version="550.54.15", + ) + system_info = FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), - patch( - "cuda.pathfinder.find_nvidia_header_directory", - return_value="/usr/local/cuda/include", - ), patch("pathlib.Path.glob", return_value=[]), patch("rapids_cli.debug.debug.gather_package_versions", return_value={}), patch("rapids_cli.debug.debug.gather_command_output", return_value=None), patch("rapids_cli.debug.debug.gather_tools", return_value={}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - run_debug(output_format="console") + run_debug(output_format="console", gpu_info=gpu_info, system_info=system_info) captured = capsys.readouterr() assert "RAPIDS Debug Information" in captured.out def test_run_debug_json(capsys): - """Test run_debug with JSON output.""" + gpu_info = FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) + system_info = FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) + with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), - patch( - "cuda.pathfinder.find_nvidia_header_directory", - return_value="/usr/local/cuda/include", - ), patch("pathlib.Path.glob", return_value=[]), patch( "rapids_cli.debug.debug.gather_package_versions", @@ -114,7 +109,7 @@ def test_run_debug_json(capsys): patch("rapids_cli.debug.debug.gather_tools", return_value={"pip": "pip 23.0"}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - run_debug(output_format="json") + run_debug(output_format="json", gpu_info=gpu_info, system_info=system_info) captured = capsys.readouterr() output = json.loads(captured.out) diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py index a895bc2..f9fdf28 100644 --- a/rapids_cli/tests/test_gpu.py +++ b/rapids_cli/tests/test_gpu.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest from rapids_cli.doctor.checks.gpu import ( @@ -9,67 +7,60 @@ check_gpu_compute_capability, gpu_check, ) +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo def test_gpu_check_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - ): - result = gpu_check(verbose=True) - assert result == "GPU(s) detected: 2" + gpu_info = FakeGpuInfo(device_count=2) + result = gpu_check(verbose=True, gpu_info=gpu_info) + assert result == "GPU(s) detected: 2" def test_gpu_check_no_gpus(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=0), - ): - with pytest.raises(AssertionError, match="No GPUs detected"): - gpu_check(verbose=False) + gpu_info = FakeGpuInfo(device_count=0) + with pytest.raises(AssertionError, match="No GPUs detected"): + gpu_check(verbose=False, gpu_info=gpu_info) def test_gpu_check_nvml_error(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises(ValueError, match="No available GPUs detected"): - gpu_check(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises(ValueError, match="No available GPUs detected"): + gpu_check(verbose=False, gpu_info=gpu_info) def test_check_gpu_compute_capability_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex"), - patch( - "pynvml.nvmlDeviceGetCudaComputeCapability", - return_value=(REQUIRED_COMPUTE_CAPABILITY, 5), + devices = [ + DeviceInfo( + index=0, + compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5), + memory_total_bytes=0, ), - ): - result = check_gpu_compute_capability(verbose=True) - assert result is True + DeviceInfo( + index=1, + compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5), + memory_total_bytes=0, + ), + ] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_gpu_compute_capability(verbose=True, gpu_info=gpu_info) + assert result is True def test_check_gpu_compute_capability_insufficient(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlDeviceGetHandleByIndex"), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(6, 0)), + devices = [ + DeviceInfo(index=0, compute_capability=(6, 0), memory_total_bytes=0), + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + with pytest.raises( + ValueError, + match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", ): - with pytest.raises( - ValueError, - match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", - ): - check_gpu_compute_capability(verbose=False) + check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) def test_check_gpu_compute_capability_no_gpu(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises( - ValueError, match="No GPU - cannot determine GPU Compute Capability" - ): - check_gpu_compute_capability(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="No GPU - cannot determine GPU Compute Capability" + ): + check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_hardware.py b/rapids_cli/tests/test_hardware.py new file mode 100644 index 0000000..1236e0f --- /dev/null +++ b/rapids_cli/tests/test_hardware.py @@ -0,0 +1,235 @@ +# SPDX-FileCopyrightText: Copyright 
(c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock, patch + +import pynvml +import pytest + +from rapids_cli.hardware import ( + DefaultSystemInfo, + DeviceInfo, + FailingGpuInfo, + FailingSystemInfo, + FakeGpuInfo, + FakeSystemInfo, + GpuInfoProvider, + NvmlGpuInfo, + SystemInfoProvider, +) + +# --- NvmlGpuInfo tests --- + + +def test_nvml_gpu_info_init_failure(): + with patch( + "pynvml.nvmlInit", + side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED), + ): + gpu_info = NvmlGpuInfo() + with pytest.raises(ValueError, match="Unable to initialize GPU driver"): + _ = gpu_info.device_count + + +def test_nvml_gpu_info_loads_once(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + with ( + patch("pynvml.nvmlInit") as mock_init, + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + gpu_info = NvmlGpuInfo() + # Access multiple properties to verify caching + _ = gpu_info.device_count + _ = gpu_info.devices + _ = gpu_info.cuda_driver_version + _ = gpu_info.driver_version + # nvmlInit should be called exactly once + mock_init.assert_called_once() + + +def test_nvml_gpu_info_device_data(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 24 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12060), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="560.10"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(9, 0)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.device_count == 2 + assert len(gpu_info.devices) == 2 + assert gpu_info.devices[0].compute_capability == (9, 0) + assert gpu_info.devices[0].memory_total_bytes == 24 * 1024**3 + assert gpu_info.cuda_driver_version == 12060 + assert gpu_info.driver_version == "560.10" + + +def test_nvml_gpu_info_nvlink_states(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + def nvlink_side_effect(handle, link_id): + if link_id < 2: + return 1 + raise pynvml.NVMLError_NotSupported() + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=nvlink_side_effect), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.devices[0].nvlink_states == [True, True] + + +def test_nvml_gpu_info_no_nvlink(): + mock_handle = MagicMock() 
+ mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.devices[0].nvlink_states == [] + + +# --- DefaultSystemInfo tests --- + + +def test_default_system_info_total_memory(): + mock_vm = MagicMock() + mock_vm.total = 64 * 1024**3 + with patch("psutil.virtual_memory", return_value=mock_vm): + sys_info = DefaultSystemInfo() + assert sys_info.total_memory_bytes == 64 * 1024**3 + + +def test_default_system_info_cuda_runtime_path(): + with patch( + "cuda.pathfinder.find_nvidia_header_directory", + return_value="/usr/local/cuda/include", + ): + sys_info = DefaultSystemInfo() + assert sys_info.cuda_runtime_path == "/usr/local/cuda/include" + + +def test_default_system_info_caches(): + mock_vm = MagicMock() + mock_vm.total = 64 * 1024**3 + with patch("psutil.virtual_memory", return_value=mock_vm) as mock_psutil: + sys_info = DefaultSystemInfo() + _ = sys_info.total_memory_bytes + _ = sys_info.total_memory_bytes + mock_psutil.assert_called_once() + + +# --- FakeGpuInfo tests --- + + +def test_fake_gpu_info_defaults(): + fake = FakeGpuInfo() + assert fake.device_count == 0 + assert fake.devices == [] + assert fake.cuda_driver_version == 0 + assert fake.driver_version == "" + + +def test_fake_gpu_info_custom(): + devices = [ + DeviceInfo(index=0, compute_capability=(8, 0), memory_total_bytes=32 * 1024**3) + ] + fake = FakeGpuInfo( + device_count=1, + devices=devices, + cuda_driver_version=12040, + driver_version="550.0", + ) + assert fake.device_count == 1 + assert len(fake.devices) == 1 + assert fake.cuda_driver_version == 12040 + + +def test_fake_gpu_info_satisfies_protocol(): + assert isinstance(FakeGpuInfo(), GpuInfoProvider) + + +# --- FakeSystemInfo tests --- + + +def test_fake_system_info_defaults(): + fake = FakeSystemInfo() + assert fake.total_memory_bytes == 0 + assert fake.cuda_runtime_path is None + + +def test_fake_system_info_satisfies_protocol(): + assert isinstance(FakeSystemInfo(), SystemInfoProvider) + + +# --- FailingGpuInfo tests --- + + +def test_failing_gpu_info_device_count(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().device_count + + +def test_failing_gpu_info_devices(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().devices + + +def test_failing_gpu_info_cuda_driver_version(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().cuda_driver_version + + +def test_failing_gpu_info_driver_version(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().driver_version + + +# --- FailingSystemInfo tests --- + + +def test_failing_system_info_total_memory(): + with pytest.raises(ValueError, match="System info unavailable"): + _ = FailingSystemInfo().total_memory_bytes + + +def test_failing_system_info_cuda_runtime_path(): + with pytest.raises(ValueError, match="System info unavailable"): + _ = FailingSystemInfo().cuda_runtime_path diff --git 
a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py index 572df33..183d7ff 100644 --- a/rapids_cli/tests/test_memory.py +++ b/rapids_cli/tests/test_memory.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock, patch - import pytest from rapids_cli.doctor.checks.memory import ( @@ -9,74 +7,62 @@ get_gpu_memory, get_system_memory, ) +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo, FakeSystemInfo def test_get_system_memory(): - mock_vm = MagicMock() - mock_vm.total = 32 * 1024**3 # 32 GB in bytes - with patch("psutil.virtual_memory", return_value=mock_vm): - result = get_system_memory(verbose=False) - assert result == 32.0 + system_info = FakeSystemInfo(total_memory_bytes=32 * 1024**3) + result = get_system_memory(verbose=False, system_info=system_info) + assert result == 32.0 def test_get_gpu_memory_single_gpu(): - mock_handle = MagicMock() - mock_memory_info = MagicMock() - mock_memory_info.total = 16 * 1024**3 # 16 GB in bytes - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info), - patch("pynvml.nvmlShutdown"), - ): - result = get_gpu_memory(verbose=False) - assert result == 16.0 + devices = [ + DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3) + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + result = get_gpu_memory(verbose=False, gpu_info=gpu_info) + assert result == 16.0 def test_get_gpu_memory_multiple_gpus(): - mock_handle = MagicMock() - mock_memory_info = MagicMock() - mock_memory_info.total = 16 * 1024**3 # 16 GB per GPU - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=4), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info), - patch("pynvml.nvmlShutdown"), - ): - result = get_gpu_memory(verbose=False) - assert result == 64.0 # 16 GB * 4 GPUs + devices = [ + DeviceInfo(index=i, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3) + for i in range(4) + ] + gpu_info = FakeGpuInfo(device_count=4, devices=devices) + result = get_gpu_memory(verbose=False, gpu_info=gpu_info) + assert result == 64.0 # 16 GB * 4 GPUs def test_check_memory_to_gpu_ratio_good_ratio(): - with ( - patch("pynvml.nvmlInit"), - patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=64.0), - patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0), - ): - result = check_memory_to_gpu_ratio(verbose=True) - assert result is True + devices = [ + DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3) + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + system_info = FakeSystemInfo(total_memory_bytes=64 * 1024**3) + result = check_memory_to_gpu_ratio( + verbose=True, gpu_info=gpu_info, system_info=system_info + ) + assert result is True def test_check_memory_to_gpu_ratio_warning(): - with ( - patch("pynvml.nvmlInit"), - patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=32.0), - patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0), - ): - with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"): - result = 
check_memory_to_gpu_ratio(verbose=True) - assert result is True + devices = [ + DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3) + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + system_info = FakeSystemInfo(total_memory_bytes=32 * 1024**3) + with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"): + result = check_memory_to_gpu_ratio( + verbose=True, gpu_info=gpu_info, system_info=system_info + ) + assert result is True def test_check_memory_to_gpu_ratio_no_gpu(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises( - ValueError, match="GPU not found. Please ensure GPUs are installed." - ): - check_memory_to_gpu_ratio(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="GPU not found. Please ensure GPUs are installed." + ): + check_memory_to_gpu_ratio(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py index 4deb0dc..bf5a1ec 100644 --- a/rapids_cli/tests/test_nvlink.py +++ b/rapids_cli/tests/test_nvlink.py @@ -1,10 +1,18 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock, patch - import pytest from rapids_cli.doctor.checks.nvlink import check_nvlink_status +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo + + +def _make_device(index: int, nvlink_states: list[bool]) -> DeviceInfo: + return DeviceInfo( + index=index, + compute_capability=(7, 0), + memory_total_bytes=0, + nvlink_states=nvlink_states, + ) @pytest.mark.parametrize( @@ -16,130 +24,62 @@ ) def test_check_nvlink_status_success(verbose, expected): """2 GPUs, all NVLinks active — verbose controls whether a summary string is returned.""" - import pynvml - - mock_handle = MagicMock() - - # Simulate a V100 with 6 NVLink slots; link_id >= 6 is out of range. - def mock_link_state(handle, link_id): - if link_id >= 6: - raise pynvml.NVMLError_InvalidArgument - return pynvml.NVML_FEATURE_ENABLED - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state), - ): - result = check_nvlink_status(verbose=verbose) - assert result == expected + # Simulate a V100 with 6 NVLink slots, all active. + devices = [_make_device(0, [True] * 6), _make_device(1, [True] * 6)] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_nvlink_status(verbose=verbose, gpu_info=gpu_info) + assert result == expected def test_check_nvlink_status_single_gpu(): """Single GPU — NVLink is not applicable, check skips early.""" - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - ): - result = check_nvlink_status(verbose=False) - assert result is False + gpu_info = FakeGpuInfo(device_count=1, devices=[_make_device(0, [])]) + result = check_nvlink_status(verbose=False, gpu_info=gpu_info) + assert result is False def test_check_nvlink_status_no_gpu(): - """nvmlInit fails — no GPUs installed.""" - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises( - ValueError, match="GPU not found. Please ensure GPUs are installed." 
- ): - check_nvlink_status(verbose=False) + """GPU info unavailable — surfaces as a GPU-not-found error.""" + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="GPU not found. Please ensure GPUs are installed." + ): + check_nvlink_status(verbose=False, gpu_info=gpu_info) def test_check_nvlink_status_not_supported(): - """NVLink is not supported on this system — check skips silently like single-GPU case.""" - import pynvml - - mock_handle = MagicMock() - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch( - "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported - ), - ): - result = check_nvlink_status(verbose=False) - assert result is False + """NVLink not supported on any device — check skips silently like single-GPU case.""" + # When NVML reports NVLink as not supported, NvmlGpuInfo records an empty list. + devices = [_make_device(0, []), _make_device(1, [])] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_nvlink_status(verbose=False, gpu_info=gpu_info) + assert result is False def test_check_nvlink_status_link_inactive(): """A supported link is inactive — check fails and reports which GPU and link.""" - import pynvml - - mock_handle = MagicMock() - - # Simulate a V100 with 6 NVLink slots, all inactive. - def mock_link_state(handle, link_id): - if link_id >= 6: - raise pynvml.NVMLError_InvalidArgument - return pynvml.NVML_FEATURE_DISABLED - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state), - ): - with pytest.raises(ValueError, match="NVLink inactive on:"): - check_nvlink_status(verbose=False) + devices = [_make_device(0, [False] * 6), _make_device(1, [False] * 6)] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + with pytest.raises(ValueError, match="NVLink inactive on:"): + check_nvlink_status(verbose=False, gpu_info=gpu_info) def test_check_nvlink_status_partial_failure(): """Some links active, some inactive — all failures are reported in a single error.""" - import pynvml - - mock_handle = MagicMock() - - # Simulate a V100 with 6 NVLink slots: link 0 active, link 1 inactive, rest active. - def mock_link_state(handle, link_id): - if link_id >= 6: - raise pynvml.NVMLError_InvalidArgument - if link_id == 1: - return pynvml.NVML_FEATURE_DISABLED - return pynvml.NVML_FEATURE_ENABLED - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state), - ): - with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info: - check_nvlink_status(verbose=False) - # Both GPUs should have link 1 reported as failed - assert "GPU 0 link 1" in str(exc_info.value) - assert "GPU 1 link 1" in str(exc_info.value) - - -def test_check_nvlink_status_invalid_argument(): - """NVMLError_InvalidArgument stops link iteration early — check succeeds for valid links.""" - import pynvml - - mock_handle = MagicMock() - - # Simulate an A100 with 12 NVLink slots; link_id >= 12 is out of range. 
- def mock_link_state(handle, link_id): - if link_id >= 12: - raise pynvml.NVMLError_InvalidArgument - return pynvml.NVML_FEATURE_ENABLED - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state), - ): - result = check_nvlink_status(verbose=True) - assert result == "All NVLinks active across 2 GPUs" + # V100 with 6 NVLink slots: link 0 active, link 1 inactive, rest active. + states = [True, False, True, True, True, True] + devices = [_make_device(0, states), _make_device(1, states)] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info: + check_nvlink_status(verbose=False, gpu_info=gpu_info) + # Both GPUs should have link 1 reported as failed. + assert "GPU 0 link 1" in str(exc_info.value) + assert "GPU 1 link 1" in str(exc_info.value) + + +def test_check_nvlink_status_mixed_link_counts(): + """Links of differing counts (e.g. A100=12) iterate fully and succeed when all active.""" + devices = [_make_device(0, [True] * 12), _make_device(1, [True] * 12)] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_nvlink_status(verbose=True, gpu_info=gpu_info) + assert result == "All NVLinks active across 2 GPUs" From d9176159525dbf46a16e0cae3dcd324d69235a45 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Mon, 20 Apr 2026 14:10:26 -0400 Subject: [PATCH 2/5] Replace hardware kwargs/classes with a provider registry Introduce rapids_cli/providers.py as the process-wide home for the GPU, system, and CUDA-toolkit provider instances. The doctor orchestrator installs real providers once per run via set_providers(); checks and debug functions read them via get_gpu_info() / get_system_info() / get_toolkit_info(). Lazy fallbacks construct real implementations on first access so nothing imports pynvml / psutil / cuda.pathfinder unless a provider is actually touched. Drop the intermediate Check base class and the six check subclasses; every check is now a plain module-level function with a (verbose=False, **kwargs) signature. Same treatment for debug.run_debug and its gather_* helpers. Tests use pytest fixtures in rapids_cli/tests/conftest.py (set_gpu_info / set_system_info / set_toolkit_info) that wrap monkeypatch.setattr on the registry globals, plus an autouse reset fixture for isolation. External plugin authors who already write def my_check(verbose=False, **kwargs) are unaffected. Also fix a latent inconsistency: doctor.py now installs system_info too, so the memory check no longer relies on its own fallback at runtime. 
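The registry itself stays tiny. A sketch of the intended shape (the
set_providers() signature and slot layout here are assumptions; the
real module is rapids_cli/providers.py):

    # Sketch of rapids_cli/providers.py, from the description above.
    _gpu_info = None  # registry global; test fixtures monkeypatch these slots

    def set_providers(*, gpu_info=None):
        """Install a provider instance for the rest of the process."""
        global _gpu_info
        if gpu_info is not None:
            _gpu_info = gpu_info

    def get_gpu_info():
        """Return the installed provider, lazily building the real one."""
        global _gpu_info
        if _gpu_info is None:  # lazy fallback on first touch
            from rapids_cli.hardware import NvmlGpuInfo

            _gpu_info = NvmlGpuInfo()
        return _gpu_info

    # get_system_info() / get_toolkit_info() follow the same pattern.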
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Mike McCarty --- dependency-injection-refactoring.md | 152 ----------------------- rapids_cli/debug/debug.py | 32 ++--- rapids_cli/doctor/checks/cuda_driver.py | 14 +-- rapids_cli/doctor/checks/cuda_toolkit.py | 29 ++--- rapids_cli/doctor/checks/gpu.py | 36 ++---- rapids_cli/doctor/checks/memory.py | 54 ++------ rapids_cli/doctor/checks/nvlink.py | 15 +-- rapids_cli/doctor/doctor.py | 7 +- rapids_cli/providers.py | 67 ++++++++++ rapids_cli/tests/conftest.py | 52 ++++++++ rapids_cli/tests/test_cuda.py | 12 +- rapids_cli/tests/test_cuda_toolkit.py | 94 +++++++------- rapids_cli/tests/test_debug.py | 58 +++++---- rapids_cli/tests/test_gpu.py | 38 +++--- rapids_cli/tests/test_memory.py | 49 +++----- rapids_cli/tests/test_nvlink.py | 47 ++++--- 16 files changed, 315 insertions(+), 441 deletions(-) delete mode 100644 dependency-injection-refactoring.md create mode 100644 rapids_cli/providers.py create mode 100644 rapids_cli/tests/conftest.py diff --git a/dependency-injection-refactoring.md b/dependency-injection-refactoring.md deleted file mode 100644 index 697642c..0000000 --- a/dependency-injection-refactoring.md +++ /dev/null @@ -1,152 +0,0 @@ -# Dependency Injection Refactoring - -## Context - -The check modules (`gpu.py`, `cuda_driver.py`, `memory.py`, `nvlink.py`) -and `debug.py` previously called `pynvml`, `psutil`, and `cuda.pathfinder` -directly. This forced tests to use 50+ `mock.patch` calls with deeply -nested context managers and `MagicMock` objects to simulate hardware -configurations. A thin abstraction layer was introduced so tests can -construct plain dataclasses instead of mocking low-level library internals. - -## Approach: Default Parameter Injection with Provider Dataclasses - -A single new file `rapids_cli/hardware.py` was created containing: - -- **`DeviceInfo`** dataclass -- holds per-GPU data - (index, compute capability, memory, nvlink states) -- **`GpuInfoProvider`** protocol -- read-only interface for GPU info - (`device_count`, `devices`, `cuda_driver_version`, `driver_version`) -- **`SystemInfoProvider`** protocol -- read-only interface for system info - (`total_memory_bytes`, `cuda_runtime_path`) -- **`NvmlGpuInfo`** -- real implementation backed by pynvml - (lazy-loads on first property access, caches results) -- **`DefaultSystemInfo`** -- real implementation backed by - psutil + cuda.pathfinder (lazy-loads per property) -- **`FakeGpuInfo`** / **`FakeSystemInfo`** -- test fakes - (plain dataclasses, no hardware dependency) -- **`FailingGpuInfo`** / **`FailingSystemInfo`** -- test fakes that - raise `ValueError` on access (simulates missing hardware) - -Check functions gained an optional keyword parameter with `None` default: - -```python -def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): - if gpu_info is None: # pragma: no cover - gpu_info = NvmlGpuInfo() -``` - -The orchestrator (`doctor.py`) creates a shared `NvmlGpuInfo()` instance -and passes it to all checks via `check_fn(verbose=verbose, gpu_info=gpu_info)`. -Third-party plugins safely ignore the extra keyword argument via their -own `**kwargs`. 
- -## Files Changed - -### New file: `rapids_cli/hardware.py` - -Contains all provider abstractions: - -- `DeviceInfo` dataclass with fields: `index`, `compute_capability`, - `memory_total_bytes`, `nvlink_states` -- `GpuInfoProvider` and `SystemInfoProvider` protocols - (runtime-checkable) -- `NvmlGpuInfo` -- calls `nvmlInit()` once on first property access, - queries all device info (count, compute capability, memory, - NVLink states), and caches everything -- `DefaultSystemInfo` -- lazily loads system memory via psutil and - CUDA path via cuda.pathfinder (each cached independently) -- `FakeGpuInfo`, `FakeSystemInfo` -- `@dataclass` test fakes with - pre-set data -- `FailingGpuInfo`, `FailingSystemInfo` -- test fakes that raise - `ValueError` on any property access - -### Modified: `rapids_cli/doctor/checks/gpu.py` - -- Removed `import pynvml` -- Added `gpu_info: GpuInfoProvider | None = None` parameter and - `**kwargs` to both `gpu_check()` and `check_gpu_compute_capability()` -- Replaced direct `pynvml` calls with `gpu_info.device_count` and - iteration over `gpu_info.devices` - -### Modified: `rapids_cli/doctor/checks/cuda_driver.py` - -- Removed `import pynvml` -- Added `gpu_info` parameter and `**kwargs` to `cuda_check()` -- Replaced nested try/except with `gpu_info.cuda_driver_version` - -### Modified: `rapids_cli/doctor/checks/memory.py` - -- Removed `import pynvml` and `import psutil` -- Added `system_info` parameter to `get_system_memory()` -- Added `gpu_info` parameter to `get_gpu_memory()` -- Added both `gpu_info` and `system_info` parameters to - `check_memory_to_gpu_ratio()` -- `get_system_memory()` reads `system_info.total_memory_bytes` -- `get_gpu_memory()` sums `dev.memory_total_bytes` from - `gpu_info.devices` -- `check_memory_to_gpu_ratio()` passes injected providers down - to helpers - -### Modified: `rapids_cli/doctor/checks/nvlink.py` - -- Removed `import pynvml` -- Added `gpu_info` parameter and `**kwargs` to `check_nvlink_status()` -- Iterates `dev.nvlink_states` instead of calling - `nvmlDeviceGetNvLinkState` -- **Side-fix**: the original code always passed `0` instead of - `nvlink_id` to `nvmlDeviceGetNvLinkState`; the refactored - `NvmlGpuInfo` queries each link by its actual index - -### Modified: `rapids_cli/debug/debug.py` - -- Removed `import pynvml` and `import cuda.pathfinder` -- Added `gpu_info` parameter to `gather_cuda_version()` -- Added `gpu_info` and `system_info` parameters to `run_debug()` -- Replaced direct pynvml/cuda.pathfinder calls with provider - property accesses - -### Modified: `rapids_cli/doctor/doctor.py` - -- Imports `NvmlGpuInfo` from `rapids_cli.hardware` -- Creates a shared `NvmlGpuInfo()` instance before the check loop -- Passes it via `check_fn(verbose=verbose, gpu_info=gpu_info)` - -### Rewritten tests - -`test_gpu.py`, `test_cuda.py`, `test_memory.py`, `test_nvlink.py`, -`test_debug.py`: - -- Replaced all `patch("pynvml.*")` / `patch("psutil.*")` / - `patch("cuda.pathfinder.*")` with `FakeGpuInfo` / `FakeSystemInfo` / - `FailingGpuInfo` construction -- Tests for `debug.py` still use patches for non-hardware concerns - (subprocess, pathlib, gather_tools) - -### New file: `rapids_cli/tests/test_hardware.py` - -- Unit tests for `NvmlGpuInfo` - (init failure, loads once, device data, NVLink states, no NVLink) -- Unit tests for `DefaultSystemInfo` - (total memory, CUDA runtime path, caching) -- Tests for `FakeGpuInfo` / `FakeSystemInfo` - (defaults, custom values, protocol satisfaction) -- Tests for `FailingGpuInfo` / 
`FailingSystemInfo` - (all properties raise) - -## Impact - -| Metric | Before | After | -| --------------------------------------------- | ------- | --------------------------------- | -| Hardware library patches in check/debug tests | ~51 | 0 (moved to test_hardware.py) | -| import pynvml in check/debug modules | 5 files | 1 file (hardware.py) | -| MagicMock objects for hardware | ~11 | 0 | -| pynvml.nvmlInit() calls in production | 7 | 1 (in NvmlGpuInfo._ensure_loaded) | -| Total tests | 53 | 72 (+19 hardware tests) | -| Coverage | 95%+ | 97.72% | - -## Verification - -1. `pytest` -- all 72 tests pass -2. `pytest --cov-fail-under=95` -- coverage at 97.72%, above threshold -3. `pre-commit run --all-files` -- all checks pass diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py index b4afde5..9344c7f 100644 --- a/rapids_cli/debug/debug.py +++ b/rapids_cli/debug/debug.py @@ -11,25 +11,18 @@ from datetime import datetime from importlib.metadata import distributions, version from pathlib import Path -from typing import TYPE_CHECKING from rich.console import Console from rich.table import Table -if TYPE_CHECKING: - from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider +from rapids_cli.providers import get_gpu_info, get_system_info console = Console() -def gather_cuda_version(*, gpu_info: GpuInfoProvider | None = None): +def gather_cuda_version(): """Return CUDA driver version as a string, similar to nvidia-smi output.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - - ver = gpu_info.cuda_driver_version + ver = get_gpu_info().cuda_driver_version # pynvml returns an int like 12040 for 12.4, so format as string major = ver // 1000 minor = (ver % 1000) // 10 @@ -76,21 +69,10 @@ def gather_tools(): } -def run_debug( - output_format="console", - *, - gpu_info: GpuInfoProvider | None = None, - system_info: SystemInfoProvider | None = None, -): +def run_debug(output_format="console"): """Run debug.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - if system_info is None: # pragma: no cover - from rapids_cli.hardware import DefaultSystemInfo - - system_info = DefaultSystemInfo() + gpu_info = get_gpu_info() + system_info = get_system_info() debug_info = { "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), @@ -99,7 +81,7 @@ def run_debug( ["nvidia-smi"], "Nvidia-smi not installed" ), "driver_version": gpu_info.driver_version, - "cuda_version": gather_cuda_version(gpu_info=gpu_info), + "cuda_version": gather_cuda_version(), "cuda_runtime_path": system_info.cuda_runtime_path, "system_ctk": sorted( [str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()] diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py index 6275c1a..0b92f36 100644 --- a/rapids_cli/doctor/checks/cuda_driver.py +++ b/rapids_cli/doctor/checks/cuda_driver.py @@ -4,20 +4,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from rapids_cli.providers import get_gpu_info -if TYPE_CHECKING: - from rapids_cli.hardware import GpuInfoProvider - -def cuda_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): +def cuda_check(verbose=False, **kwargs): """Check CUDA availability.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - try: - return gpu_info.cuda_driver_version + return get_gpu_info().cuda_driver_version 
except ValueError as e: raise ValueError("Unable to look up CUDA version") from e diff --git a/rapids_cli/doctor/checks/cuda_toolkit.py b/rapids_cli/doctor/checks/cuda_toolkit.py index 033bc52..b7a9ced 100644 --- a/rapids_cli/doctor/checks/cuda_toolkit.py +++ b/rapids_cli/doctor/checks/cuda_toolkit.py @@ -186,29 +186,26 @@ def _gather_toolkit_info() -> CudaToolkitInfo: # pragma: no cover return info -def cuda_toolkit_check( - verbose=False, *, toolkit_info: CudaToolkitInfo | None = None, **kwargs -): +def cuda_toolkit_check(verbose=False, **kwargs): """Check CUDA toolkit library availability and version consistency.""" - if toolkit_info is None: # pragma: no cover - toolkit_info = _gather_toolkit_info() + from rapids_cli.providers import get_toolkit_info + + info = get_toolkit_info() # Check library findability - if toolkit_info.missing_libs: - any_found_via = next(iter(toolkit_info.found_libs.values()), None) - raise ValueError( - _format_missing_error(toolkit_info.missing_libs, any_found_via) - ) + if info.missing_libs: + any_found_via = next(iter(info.found_libs.values()), None) + raise ValueError(_format_missing_error(info.missing_libs, any_found_via)) # Check driver availability - if toolkit_info.driver_major is None: + if info.driver_major is None: raise ValueError( "Unable to query the GPU driver's CUDA version. " "RAPIDS requires a working NVIDIA GPU driver." ) - driver_major = toolkit_info.driver_major - toolkit_major = toolkit_info.toolkit_major + driver_major = info.driver_major + toolkit_major = info.toolkit_major # Compare toolkit to driver (only error when toolkit > driver, drivers are backward compatible) if toolkit_major is not None and toolkit_major > driver_major: @@ -216,14 +213,14 @@ def cuda_toolkit_check( _format_mismatch_error( toolkit_major, driver_major, - toolkit_info.found_libs.get("cudart"), - toolkit_info.cudart_path, + info.found_libs.get("cudart"), + info.cudart_path, ) ) # Only check system paths if CUDA was found via system/CUDA_HOME. # When found via conda or pip, RAPIDS uses those libs and ignores system paths. 
- cudart_source = toolkit_info.found_libs.get("cudart", "") + cudart_source = info.found_libs.get("cudart", "") if cudart_source not in ("conda", "site-packages"): if _CUDA_SYMLINK.exists(): _check_path_version( diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py index d8e1a45..1c14f36 100644 --- a/rapids_cli/doctor/checks/gpu.py +++ b/rapids_cli/doctor/checks/gpu.py @@ -4,50 +4,34 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from rapids_cli.hardware import GpuInfoProvider +from rapids_cli.providers import get_gpu_info REQUIRED_COMPUTE_CAPABILITY = 7 -def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): +def gpu_check(verbose=False, **kwargs): """Check GPU availability.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - try: - num_gpus = gpu_info.device_count + num_gpus = get_gpu_info().device_count except ValueError as e: raise ValueError("No available GPUs detected") from e assert num_gpus > 0, "No GPUs detected" return f"GPU(s) detected: {num_gpus}" -def check_gpu_compute_capability( - verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs -): +def check_gpu_compute_capability(verbose=False, **kwargs): """Check the system for GPU Compute Capability.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - try: - devices = gpu_info.devices + devices = get_gpu_info().devices except ValueError as e: raise ValueError("No GPU - cannot determine GPU Compute Capability") from e for dev in devices: if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY: continue - else: - raise ValueError( - f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " - f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}." - "See https://developer.nvidia.com/cuda-gpus for more information." - ) + raise ValueError( + f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " + f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}." + "See https://developer.nvidia.com/cuda-gpus for more information." 
+ ) return True diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py index f1d8231..35e711e 100644 --- a/rapids_cli/doctor/checks/memory.py +++ b/rapids_cli/doctor/checks/memory.py @@ -5,69 +5,31 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider +from rapids_cli.providers import get_gpu_info, get_system_info -def get_system_memory( - verbose=False, *, system_info: SystemInfoProvider | None = None, **kwargs -): +def get_system_memory(verbose=False, **kwargs): """Get the total system memory.""" - if system_info is None: # pragma: no cover - from rapids_cli.hardware import DefaultSystemInfo + return get_system_info().total_memory_bytes / (1024**3) - system_info = DefaultSystemInfo() - total_memory = system_info.total_memory_bytes / ( - 1024**3 - ) # converts bytes to gigabytes - return total_memory - - -def get_gpu_memory(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): +def get_gpu_memory(verbose=False, **kwargs): """Get the total GPU memory.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - - gpu_memory_total = sum(dev.memory_total_bytes for dev in gpu_info.devices) / ( - 1024**3 - ) # converts to gigabytes - return gpu_memory_total + return sum(dev.memory_total_bytes for dev in get_gpu_info().devices) / (1024**3) -def check_memory_to_gpu_ratio( - verbose=True, - *, - gpu_info: GpuInfoProvider | None = None, - system_info: SystemInfoProvider | None = None, - **kwargs, -): +def check_memory_to_gpu_ratio(verbose=True, **kwargs): """Check the system for a 2:1 ratio of system Memory to total GPU Memory. This is especially useful for Dask. - """ - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - if system_info is None: # pragma: no cover - from rapids_cli.hardware import DefaultSystemInfo - - system_info = DefaultSystemInfo() - try: - _ = gpu_info.device_count + _ = get_gpu_info().device_count except ValueError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e - system_memory = get_system_memory(verbose, system_info=system_info) - gpu_memory = get_gpu_memory(verbose, gpu_info=gpu_info) - ratio = system_memory / gpu_memory + ratio = get_system_memory() / get_gpu_memory() if ratio < 1.8: warnings.warn( "System Memory to total GPU Memory ratio not at least 2:1 ratio. 
" diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index afe4f41..6bd1b7d 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -4,21 +4,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from rapids_cli.providers import get_gpu_info -if TYPE_CHECKING: - from rapids_cli.hardware import GpuInfoProvider - -def check_nvlink_status( - verbose=True, *, gpu_info: GpuInfoProvider | None = None, **kwargs -): +def check_nvlink_status(verbose=True, **kwargs): """Check NVLink status across all GPUs.""" - if gpu_info is None: # pragma: no cover - from rapids_cli.hardware import NvmlGpuInfo - - gpu_info = NvmlGpuInfo() - + gpu_info = get_gpu_info() try: device_count = gpu_info.device_count except ValueError as e: diff --git a/rapids_cli/doctor/doctor.py b/rapids_cli/doctor/doctor.py index 9bcea69..ed9ea43 100644 --- a/rapids_cli/doctor/doctor.py +++ b/rapids_cli/doctor/doctor.py @@ -8,9 +8,10 @@ from rich.console import Console +from rapids_cli import providers from rapids_cli._compatibility import entry_points from rapids_cli.constants import DOCTOR_SYMBOL -from rapids_cli.hardware import NvmlGpuInfo +from rapids_cli.hardware import DefaultSystemInfo, NvmlGpuInfo console = Console() @@ -77,7 +78,7 @@ def doctor_check( console.print("Dry run, skipping checks") return True - gpu_info = NvmlGpuInfo() + providers.set_providers(gpu_info=NvmlGpuInfo(), system_info=DefaultSystemInfo()) results: list[CheckResult] = [] with console.status("[bold green]Running checks...") as ui_status: @@ -90,7 +91,7 @@ def doctor_check( with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") status = True - value = check_fn(verbose=verbose, gpu_info=gpu_info) + value = check_fn(verbose=verbose) caught_warnings = w except Exception as e: diff --git a/rapids_cli/providers.py b/rapids_cli/providers.py new file mode 100644 index 0000000..1641fa4 --- /dev/null +++ b/rapids_cli/providers.py @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Process-wide hardware provider registry. + +The doctor orchestrator installs real providers once per run via +``set_providers``; check and debug functions read them via the ``get_*`` +accessors. Tests swap in fakes with ``monkeypatch.setattr`` against the +module-level globals (or via the fixtures in ``rapids_cli/tests/conftest.py``). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rapids_cli.doctor.checks.cuda_toolkit import CudaToolkitInfo + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider + +_gpu_info: GpuInfoProvider | None = None +_system_info: SystemInfoProvider | None = None +_toolkit_info: CudaToolkitInfo | None = None + + +def set_providers( + *, + gpu_info: GpuInfoProvider | None = None, + system_info: SystemInfoProvider | None = None, + toolkit_info: CudaToolkitInfo | None = None, +) -> None: + """Install providers for the current run. 
Only non-None args are applied.""" + global _gpu_info, _system_info, _toolkit_info + if gpu_info is not None: + _gpu_info = gpu_info + if system_info is not None: + _system_info = system_info + if toolkit_info is not None: + _toolkit_info = toolkit_info + + +def get_gpu_info() -> GpuInfoProvider: + """Return the installed GPU info provider, lazily creating a real one.""" + global _gpu_info + if _gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + _gpu_info = NvmlGpuInfo() + return _gpu_info + + +def get_system_info() -> SystemInfoProvider: + """Return the installed system info provider, lazily creating a real one.""" + global _system_info + if _system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + _system_info = DefaultSystemInfo() + return _system_info + + +def get_toolkit_info() -> CudaToolkitInfo: + """Return the installed toolkit info, lazily gathering it from the system.""" + global _toolkit_info + if _toolkit_info is None: # pragma: no cover + from rapids_cli.doctor.checks.cuda_toolkit import _gather_toolkit_info + + _toolkit_info = _gather_toolkit_info() + return _toolkit_info diff --git a/rapids_cli/tests/conftest.py b/rapids_cli/tests/conftest.py new file mode 100644 index 0000000..71bae86 --- /dev/null +++ b/rapids_cli/tests/conftest.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Shared test fixtures for the rapids-cli test suite.""" + +from __future__ import annotations + +import pytest + +from rapids_cli import providers + + +@pytest.fixture(autouse=True) +def _reset_providers(monkeypatch): + """Ensure each test starts with a clean provider registry. + + Tests that need specific providers installed use the ``set_gpu_info`` / + ``set_system_info`` / ``set_toolkit_info`` fixtures, which install fakes + via ``monkeypatch.setattr`` so they auto-revert after the test. 
+ """ + monkeypatch.setattr(providers, "_gpu_info", None) + monkeypatch.setattr(providers, "_system_info", None) + monkeypatch.setattr(providers, "_toolkit_info", None) + + +@pytest.fixture +def set_gpu_info(monkeypatch): + """Install a fake GPU info provider for the duration of the test.""" + + def _set(fake): + monkeypatch.setattr(providers, "_gpu_info", fake) + + return _set + + +@pytest.fixture +def set_system_info(monkeypatch): + """Install a fake system info provider for the duration of the test.""" + + def _set(fake): + monkeypatch.setattr(providers, "_system_info", fake) + + return _set + + +@pytest.fixture +def set_toolkit_info(monkeypatch): + """Install a fake CUDA toolkit info for the duration of the test.""" + + def _set(fake): + monkeypatch.setattr(providers, "_toolkit_info", fake) + + return _set diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py index de4fd99..518c3fb 100644 --- a/rapids_cli/tests/test_cuda.py +++ b/rapids_cli/tests/test_cuda.py @@ -6,12 +6,12 @@ from rapids_cli.hardware import FailingGpuInfo, FakeGpuInfo -def test_cuda_check_success(): - gpu_info = FakeGpuInfo(cuda_driver_version=12050) - assert cuda_check(verbose=True, gpu_info=gpu_info) == 12050 +def test_cuda_check_success(set_gpu_info): + set_gpu_info(FakeGpuInfo(cuda_driver_version=12050)) + assert cuda_check(verbose=True) == 12050 -def test_cuda_check_no_gpu(): - gpu_info = FailingGpuInfo() +def test_cuda_check_no_gpu(set_gpu_info): + set_gpu_info(FailingGpuInfo()) with pytest.raises(ValueError, match="Unable to look up CUDA version"): - cuda_check(verbose=False, gpu_info=gpu_info) + cuda_check(verbose=False) diff --git a/rapids_cli/tests/test_cuda_toolkit.py b/rapids_cli/tests/test_cuda_toolkit.py index 8d1a19a..7487bbf 100644 --- a/rapids_cli/tests/test_cuda_toolkit.py +++ b/rapids_cli/tests/test_cuda_toolkit.py @@ -71,9 +71,9 @@ def test_ctypes_cuda_version_oserror(): # Check function tests -def test_check_success(): - info = _make_info() - result = cuda_toolkit_check(verbose=True, toolkit_info=info) +def test_check_success(set_toolkit_info): + set_toolkit_info(_make_info()) + result = cuda_toolkit_check(verbose=True) assert isinstance(result, str) assert "CUDA 12" in result @@ -90,75 +90,83 @@ def test_check_success(): ], ids=["all_missing", "partial_missing"], ) -def test_check_missing_libs(found_libs, missing_libs, expected_match): - info = _make_info( - found_libs=found_libs, - missing_libs=missing_libs, - cudart_path=None if not found_libs else "/usr/lib/libcudart.so", - toolkit_major=None if not found_libs else 12, +def test_check_missing_libs(set_toolkit_info, found_libs, missing_libs, expected_match): + set_toolkit_info( + _make_info( + found_libs=found_libs, + missing_libs=missing_libs, + cudart_path=None if not found_libs else "/usr/lib/libcudart.so", + toolkit_major=None if not found_libs else 12, + ) ) with pytest.raises(ValueError, match=expected_match): - cuda_toolkit_check(toolkit_info=info) + cuda_toolkit_check() -def test_check_driver_query_fails(): - info = _make_info(driver_major=None) +def test_check_driver_query_fails(set_toolkit_info): + set_toolkit_info(_make_info(driver_major=None)) with pytest.raises(ValueError, match="Unable to query"): - cuda_toolkit_check(toolkit_info=info) + cuda_toolkit_check() -def test_check_toolkit_newer_than_driver(): +def test_check_toolkit_newer_than_driver(set_toolkit_info): """CUDA 13 toolkit + CUDA 12 driver = error.""" - info = _make_info( - found_libs={"cudart": "conda", "nvrtc": "conda", "nvvm": "conda"}, - 
cudart_path="/usr/lib/libcudart.so.13", - toolkit_major=13, - driver_major=12, + set_toolkit_info( + _make_info( + found_libs={"cudart": "conda", "nvrtc": "conda", "nvvm": "conda"}, + cudart_path="/usr/lib/libcudart.so.13", + toolkit_major=13, + driver_major=12, + ) ) with pytest.raises(ValueError, match="newer than what the GPU driver supports"): - cuda_toolkit_check(toolkit_info=info) + cuda_toolkit_check() -def test_check_toolkit_older_than_driver_passes(): +def test_check_toolkit_older_than_driver_passes(set_toolkit_info): """CUDA 12 toolkit + CUDA 13 driver = fine (backward compatible).""" - info = _make_info(toolkit_major=12, driver_major=13) - assert cuda_toolkit_check(verbose=False, toolkit_info=info) is True + set_toolkit_info(_make_info(toolkit_major=12, driver_major=13)) + assert cuda_toolkit_check(verbose=False) is True -def test_check_cuda_symlink_newer_than_driver(tmp_path): +def test_check_cuda_symlink_newer_than_driver(set_toolkit_info, tmp_path): """Only checked when CUDA was found via system paths, not conda/pip.""" symlink_target = tmp_path / "cuda-13.0" symlink_target.mkdir() symlink_path = tmp_path / "cuda" symlink_path.symlink_to(symlink_target) - info = _make_info( - found_libs={ - "cudart": "system-search", - "nvrtc": "system-search", - "nvvm": "system-search", - }, - toolkit_major=12, - driver_major=12, + set_toolkit_info( + _make_info( + found_libs={ + "cudart": "system-search", + "nvrtc": "system-search", + "nvvm": "system-search", + }, + toolkit_major=12, + driver_major=12, + ) ) with ( patch("rapids_cli.doctor.checks.cuda_toolkit._CUDA_SYMLINK", symlink_path), patch.dict("os.environ", {}, clear=True), ): with pytest.raises(ValueError, match="points to CUDA 13"): - cuda_toolkit_check(toolkit_info=info) + cuda_toolkit_check() -def test_check_cuda_home_newer_than_driver(): +def test_check_cuda_home_newer_than_driver(set_toolkit_info): """Only checked when CUDA was found via system paths, not conda/pip.""" - info = _make_info( - found_libs={ - "cudart": "system-search", - "nvrtc": "system-search", - "nvvm": "system-search", - }, - toolkit_major=12, - driver_major=12, + set_toolkit_info( + _make_info( + found_libs={ + "cudart": "system-search", + "nvrtc": "system-search", + "nvvm": "system-search", + }, + toolkit_major=12, + driver_major=12, + ) ) with ( patch( @@ -167,4 +175,4 @@ def test_check_cuda_home_newer_than_driver(): patch.dict("os.environ", {"CUDA_HOME": "/usr/local/cuda-13.0"}, clear=True), ): with pytest.raises(ValueError, match="CUDA_HOME"): - cuda_toolkit_check(toolkit_info=info) + cuda_toolkit_check() diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py index 79b9db5..1da549c 100644 --- a/rapids_cli/tests/test_debug.py +++ b/rapids_cli/tests/test_debug.py @@ -13,16 +13,14 @@ from rapids_cli.hardware import FakeGpuInfo, FakeSystemInfo -def test_gather_cuda_version(): - gpu_info = FakeGpuInfo(cuda_driver_version=12040) - result = gather_cuda_version(gpu_info=gpu_info) - assert result == "12.4" +def test_gather_cuda_version(set_gpu_info): + set_gpu_info(FakeGpuInfo(cuda_driver_version=12040)) + assert gather_cuda_version() == "12.4" -def test_gather_cuda_version_with_patch(): - gpu_info = FakeGpuInfo(cuda_driver_version=12345) - result = gather_cuda_version(gpu_info=gpu_info) - assert result == "12.34.5" +def test_gather_cuda_version_with_patch(set_gpu_info): + set_gpu_info(FakeGpuInfo(cuda_driver_version=12345)) + assert gather_cuda_version() == "12.34.5" def test_gather_package_versions(): @@ -62,15 +60,19 @@ def 
test_gather_tools(): assert "g++" in result -def test_run_debug_console(capsys): - gpu_info = FakeGpuInfo( - device_count=1, - cuda_driver_version=12040, - driver_version="550.54.15", +def test_run_debug_console(capsys, set_gpu_info, set_system_info): + set_gpu_info( + FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) ) - system_info = FakeSystemInfo( - total_memory_bytes=32 * 1024**3, - cuda_runtime_path="/usr/local/cuda/include", + set_system_info( + FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) ) with ( @@ -80,21 +82,25 @@ def test_run_debug_console(capsys): patch("rapids_cli.debug.debug.gather_tools", return_value={}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - run_debug(output_format="console", gpu_info=gpu_info, system_info=system_info) + run_debug(output_format="console") captured = capsys.readouterr() assert "RAPIDS Debug Information" in captured.out -def test_run_debug_json(capsys): - gpu_info = FakeGpuInfo( - device_count=1, - cuda_driver_version=12040, - driver_version="550.54.15", +def test_run_debug_json(capsys, set_gpu_info, set_system_info): + set_gpu_info( + FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) ) - system_info = FakeSystemInfo( - total_memory_bytes=32 * 1024**3, - cuda_runtime_path="/usr/local/cuda/include", + set_system_info( + FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) ) with ( @@ -109,7 +115,7 @@ def test_run_debug_json(capsys): patch("rapids_cli.debug.debug.gather_tools", return_value={"pip": "pip 23.0"}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - run_debug(output_format="json", gpu_info=gpu_info, system_info=system_info) + run_debug(output_format="json") captured = capsys.readouterr() output = json.loads(captured.out) diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py index f9fdf28..b04d4c8 100644 --- a/rapids_cli/tests/test_gpu.py +++ b/rapids_cli/tests/test_gpu.py @@ -10,25 +10,24 @@ from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo -def test_gpu_check_success(): - gpu_info = FakeGpuInfo(device_count=2) - result = gpu_check(verbose=True, gpu_info=gpu_info) - assert result == "GPU(s) detected: 2" +def test_gpu_check_success(set_gpu_info): + set_gpu_info(FakeGpuInfo(device_count=2)) + assert gpu_check(verbose=True) == "GPU(s) detected: 2" -def test_gpu_check_no_gpus(): - gpu_info = FakeGpuInfo(device_count=0) +def test_gpu_check_no_gpus(set_gpu_info): + set_gpu_info(FakeGpuInfo(device_count=0)) with pytest.raises(AssertionError, match="No GPUs detected"): - gpu_check(verbose=False, gpu_info=gpu_info) + gpu_check(verbose=False) -def test_gpu_check_nvml_error(): - gpu_info = FailingGpuInfo() +def test_gpu_check_nvml_error(set_gpu_info): + set_gpu_info(FailingGpuInfo()) with pytest.raises(ValueError, match="No available GPUs detected"): - gpu_check(verbose=False, gpu_info=gpu_info) + gpu_check(verbose=False) -def test_check_gpu_compute_capability_success(): +def test_check_gpu_compute_capability_success(set_gpu_info): devices = [ DeviceInfo( index=0, @@ -41,26 +40,25 @@ def test_check_gpu_compute_capability_success(): memory_total_bytes=0, ), ] - gpu_info = FakeGpuInfo(device_count=2, devices=devices) - result = check_gpu_compute_capability(verbose=True, gpu_info=gpu_info) - assert result is True + 
set_gpu_info(FakeGpuInfo(device_count=2, devices=devices)) + assert check_gpu_compute_capability(verbose=True) is True -def test_check_gpu_compute_capability_insufficient(): +def test_check_gpu_compute_capability_insufficient(set_gpu_info): devices = [ DeviceInfo(index=0, compute_capability=(6, 0), memory_total_bytes=0), ] - gpu_info = FakeGpuInfo(device_count=1, devices=devices) + set_gpu_info(FakeGpuInfo(device_count=1, devices=devices)) with pytest.raises( ValueError, match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", ): - check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) + check_gpu_compute_capability(verbose=False) -def test_check_gpu_compute_capability_no_gpu(): - gpu_info = FailingGpuInfo() +def test_check_gpu_compute_capability_no_gpu(set_gpu_info): + set_gpu_info(FailingGpuInfo()) with pytest.raises( ValueError, match="No GPU - cannot determine GPU Compute Capability" ): - check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) + check_gpu_compute_capability(verbose=False) diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py index 183d7ff..1674da8 100644 --- a/rapids_cli/tests/test_memory.py +++ b/rapids_cli/tests/test_memory.py @@ -10,59 +10,50 @@ from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo, FakeSystemInfo -def test_get_system_memory(): - system_info = FakeSystemInfo(total_memory_bytes=32 * 1024**3) - result = get_system_memory(verbose=False, system_info=system_info) - assert result == 32.0 +def test_get_system_memory(set_system_info): + set_system_info(FakeSystemInfo(total_memory_bytes=32 * 1024**3)) + assert get_system_memory(verbose=False) == 32.0 -def test_get_gpu_memory_single_gpu(): +def test_get_gpu_memory_single_gpu(set_gpu_info): devices = [ DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3) ] - gpu_info = FakeGpuInfo(device_count=1, devices=devices) - result = get_gpu_memory(verbose=False, gpu_info=gpu_info) - assert result == 16.0 + set_gpu_info(FakeGpuInfo(device_count=1, devices=devices)) + assert get_gpu_memory(verbose=False) == 16.0 -def test_get_gpu_memory_multiple_gpus(): +def test_get_gpu_memory_multiple_gpus(set_gpu_info): devices = [ DeviceInfo(index=i, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3) for i in range(4) ] - gpu_info = FakeGpuInfo(device_count=4, devices=devices) - result = get_gpu_memory(verbose=False, gpu_info=gpu_info) - assert result == 64.0 # 16 GB * 4 GPUs + set_gpu_info(FakeGpuInfo(device_count=4, devices=devices)) + assert get_gpu_memory(verbose=False) == 64.0 # 16 GB * 4 GPUs -def test_check_memory_to_gpu_ratio_good_ratio(): +def test_check_memory_to_gpu_ratio_good_ratio(set_gpu_info, set_system_info): devices = [ DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3) ] - gpu_info = FakeGpuInfo(device_count=1, devices=devices) - system_info = FakeSystemInfo(total_memory_bytes=64 * 1024**3) - result = check_memory_to_gpu_ratio( - verbose=True, gpu_info=gpu_info, system_info=system_info - ) - assert result is True + set_gpu_info(FakeGpuInfo(device_count=1, devices=devices)) + set_system_info(FakeSystemInfo(total_memory_bytes=64 * 1024**3)) + assert check_memory_to_gpu_ratio(verbose=True) is True -def test_check_memory_to_gpu_ratio_warning(): +def test_check_memory_to_gpu_ratio_warning(set_gpu_info, set_system_info): devices = [ DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3) ] - gpu_info = FakeGpuInfo(device_count=1, devices=devices) - system_info 
= FakeSystemInfo(total_memory_bytes=32 * 1024**3) + set_gpu_info(FakeGpuInfo(device_count=1, devices=devices)) + set_system_info(FakeSystemInfo(total_memory_bytes=32 * 1024**3)) with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"): - result = check_memory_to_gpu_ratio( - verbose=True, gpu_info=gpu_info, system_info=system_info - ) - assert result is True + assert check_memory_to_gpu_ratio(verbose=True) is True -def test_check_memory_to_gpu_ratio_no_gpu(): - gpu_info = FailingGpuInfo() +def test_check_memory_to_gpu_ratio_no_gpu(set_gpu_info): + set_gpu_info(FailingGpuInfo()) with pytest.raises( ValueError, match="GPU not found. Please ensure GPUs are installed." ): - check_memory_to_gpu_ratio(verbose=False, gpu_info=gpu_info) + check_memory_to_gpu_ratio(verbose=False) diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py index bf5a1ec..f3a35e9 100644 --- a/rapids_cli/tests/test_nvlink.py +++ b/rapids_cli/tests/test_nvlink.py @@ -22,64 +22,59 @@ def _make_device(index: int, nvlink_states: list[bool]) -> DeviceInfo: (False, None), ], ) -def test_check_nvlink_status_success(verbose, expected): +def test_check_nvlink_status_success(set_gpu_info, verbose, expected): """2 GPUs, all NVLinks active — verbose controls whether a summary string is returned.""" # Simulate a V100 with 6 NVLink slots, all active. devices = [_make_device(0, [True] * 6), _make_device(1, [True] * 6)] - gpu_info = FakeGpuInfo(device_count=2, devices=devices) - result = check_nvlink_status(verbose=verbose, gpu_info=gpu_info) - assert result == expected + set_gpu_info(FakeGpuInfo(device_count=2, devices=devices)) + assert check_nvlink_status(verbose=verbose) == expected -def test_check_nvlink_status_single_gpu(): +def test_check_nvlink_status_single_gpu(set_gpu_info): """Single GPU — NVLink is not applicable, check skips early.""" - gpu_info = FakeGpuInfo(device_count=1, devices=[_make_device(0, [])]) - result = check_nvlink_status(verbose=False, gpu_info=gpu_info) - assert result is False + set_gpu_info(FakeGpuInfo(device_count=1, devices=[_make_device(0, [])])) + assert check_nvlink_status(verbose=False) is False -def test_check_nvlink_status_no_gpu(): +def test_check_nvlink_status_no_gpu(set_gpu_info): """GPU info unavailable — surfaces as a GPU-not-found error.""" - gpu_info = FailingGpuInfo() + set_gpu_info(FailingGpuInfo()) with pytest.raises( ValueError, match="GPU not found. Please ensure GPUs are installed." ): - check_nvlink_status(verbose=False, gpu_info=gpu_info) + check_nvlink_status(verbose=False) -def test_check_nvlink_status_not_supported(): +def test_check_nvlink_status_not_supported(set_gpu_info): """NVLink not supported on any device — check skips silently like single-GPU case.""" # When NVML reports NVLink as not supported, NvmlGpuInfo records an empty list. 
devices = [_make_device(0, []), _make_device(1, [])] - gpu_info = FakeGpuInfo(device_count=2, devices=devices) - result = check_nvlink_status(verbose=False, gpu_info=gpu_info) - assert result is False + set_gpu_info(FakeGpuInfo(device_count=2, devices=devices)) + assert check_nvlink_status(verbose=False) is False -def test_check_nvlink_status_link_inactive(): +def test_check_nvlink_status_link_inactive(set_gpu_info): """A supported link is inactive — check fails and reports which GPU and link.""" devices = [_make_device(0, [False] * 6), _make_device(1, [False] * 6)] - gpu_info = FakeGpuInfo(device_count=2, devices=devices) + set_gpu_info(FakeGpuInfo(device_count=2, devices=devices)) with pytest.raises(ValueError, match="NVLink inactive on:"): - check_nvlink_status(verbose=False, gpu_info=gpu_info) + check_nvlink_status(verbose=False) -def test_check_nvlink_status_partial_failure(): +def test_check_nvlink_status_partial_failure(set_gpu_info): """Some links active, some inactive — all failures are reported in a single error.""" # V100 with 6 NVLink slots: link 0 active, link 1 inactive, rest active. states = [True, False, True, True, True, True] devices = [_make_device(0, states), _make_device(1, states)] - gpu_info = FakeGpuInfo(device_count=2, devices=devices) + set_gpu_info(FakeGpuInfo(device_count=2, devices=devices)) with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info: - check_nvlink_status(verbose=False, gpu_info=gpu_info) - # Both GPUs should have link 1 reported as failed. + check_nvlink_status(verbose=False) assert "GPU 0 link 1" in str(exc_info.value) assert "GPU 1 link 1" in str(exc_info.value) -def test_check_nvlink_status_mixed_link_counts(): +def test_check_nvlink_status_mixed_link_counts(set_gpu_info): """Links of differing counts (e.g. 
A100=12) iterate fully and succeed when all active.""" devices = [_make_device(0, [True] * 12), _make_device(1, [True] * 12)] - gpu_info = FakeGpuInfo(device_count=2, devices=devices) - result = check_nvlink_status(verbose=True, gpu_info=gpu_info) - assert result == "All NVLinks active across 2 GPUs" + set_gpu_info(FakeGpuInfo(device_count=2, devices=devices)) + assert check_nvlink_status(verbose=True) == "All NVLinks active across 2 GPUs" From 56ba0292d7fc04cc090f5f8f70dd547b97a0e096 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Wed, 22 Apr 2026 10:08:04 -0400 Subject: [PATCH 3/5] Address PR review feedback: custom exception, move fakes, minimize diff noise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add HardwareInfoError custom exception replacing generic ValueError for hardware failures (suggested by jayavenkatesh19 and ncclementi) - Move test fakes (FakeGpuInfo, FakeSystemInfo, FailingGpuInfo, FailingSystemInfo) from hardware.py to rapids_cli/tests/fakes.py - Narrow NVMLError catch in nvlink enumeration to NVMLError_InvalidArgument and NVMLError_NotSupported (matching PR #143 pattern) - Remove unnecessary `from __future__ import annotations` from check modules and debug.py - Remove `**kwargs` from check functions that didn't have it on main - Revert cosmetic variable renames (ver→version, info→toolkit_info) to minimize diff noise Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Mike McCarty --- rapids_cli/debug/debug.py | 10 ++-- rapids_cli/doctor/checks/cuda_driver.py | 7 ++- rapids_cli/doctor/checks/cuda_toolkit.py | 22 ++++---- rapids_cli/doctor/checks/gpu.py | 11 ++-- rapids_cli/doctor/checks/memory.py | 11 ++-- rapids_cli/doctor/checks/nvlink.py | 7 ++- rapids_cli/hardware.py | 67 ++++-------------------- rapids_cli/tests/__init__.py | 2 + rapids_cli/tests/fakes.py | 65 +++++++++++++++++++++++ rapids_cli/tests/test_cuda.py | 2 +- rapids_cli/tests/test_debug.py | 2 +- rapids_cli/tests/test_gpu.py | 3 +- rapids_cli/tests/test_hardware.py | 23 ++++---- rapids_cli/tests/test_memory.py | 3 +- rapids_cli/tests/test_nvlink.py | 3 +- 15 files changed, 129 insertions(+), 109 deletions(-) create mode 100644 rapids_cli/tests/__init__.py create mode 100644 rapids_cli/tests/fakes.py diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py index 9344c7f..1da1a48 100644 --- a/rapids_cli/debug/debug.py +++ b/rapids_cli/debug/debug.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """This module contains the debug subcommand for the Rapids CLI.""" -from __future__ import annotations - import json import platform import subprocess @@ -22,11 +20,11 @@ def gather_cuda_version(): """Return CUDA driver version as a string, similar to nvidia-smi output.""" - ver = get_gpu_info().cuda_driver_version + version = get_gpu_info().cuda_driver_version # pynvml returns an int like 12040 for 12.4, so format as string - major = ver // 1000 - minor = (ver % 1000) // 10 - patch = ver % 10 + major = version // 1000 + minor = (version % 1000) // 10 + patch = version % 10 if patch == 0: return f"{major}.{minor}" else: diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py index 0b92f36..f709811 100644 --- a/rapids_cli/doctor/checks/cuda_driver.py +++ b/rapids_cli/doctor/checks/cuda_driver.py @@ -2,14 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 """Check for CUDA and driver compatibility.""" -from __future__ import annotations - +from rapids_cli.hardware import HardwareInfoError from 
rapids_cli.providers import get_gpu_info -def cuda_check(verbose=False, **kwargs): +def cuda_check(verbose=False): """Check CUDA availability.""" try: return get_gpu_info().cuda_driver_version - except ValueError as e: + except HardwareInfoError as e: raise ValueError("Unable to look up CUDA version") from e diff --git a/rapids_cli/doctor/checks/cuda_toolkit.py b/rapids_cli/doctor/checks/cuda_toolkit.py index b7a9ced..d2a3df8 100644 --- a/rapids_cli/doctor/checks/cuda_toolkit.py +++ b/rapids_cli/doctor/checks/cuda_toolkit.py @@ -190,22 +190,24 @@ def cuda_toolkit_check(verbose=False, **kwargs): """Check CUDA toolkit library availability and version consistency.""" from rapids_cli.providers import get_toolkit_info - info = get_toolkit_info() + toolkit_info = get_toolkit_info() # Check library findability - if info.missing_libs: - any_found_via = next(iter(info.found_libs.values()), None) - raise ValueError(_format_missing_error(info.missing_libs, any_found_via)) + if toolkit_info.missing_libs: + any_found_via = next(iter(toolkit_info.found_libs.values()), None) + raise ValueError( + _format_missing_error(toolkit_info.missing_libs, any_found_via) + ) # Check driver availability - if info.driver_major is None: + if toolkit_info.driver_major is None: raise ValueError( "Unable to query the GPU driver's CUDA version. " "RAPIDS requires a working NVIDIA GPU driver." ) - driver_major = info.driver_major - toolkit_major = info.toolkit_major + driver_major = toolkit_info.driver_major + toolkit_major = toolkit_info.toolkit_major # Compare toolkit to driver (only error when toolkit > driver, drivers are backward compatible) if toolkit_major is not None and toolkit_major > driver_major: @@ -213,14 +215,14 @@ def cuda_toolkit_check(verbose=False, **kwargs): _format_mismatch_error( toolkit_major, driver_major, - info.found_libs.get("cudart"), - info.cudart_path, + toolkit_info.found_libs.get("cudart"), + toolkit_info.cudart_path, ) ) # Only check system paths if CUDA was found via system/CUDA_HOME. # When found via conda or pip, RAPIDS uses those libs and ignores system paths. 
- cudart_source = info.found_libs.get("cudart", "") + cudart_source = toolkit_info.found_libs.get("cudart", "") if cudart_source not in ("conda", "site-packages"): if _CUDA_SYMLINK.exists(): _check_path_version( diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py index 1c14f36..848a5a9 100644 --- a/rapids_cli/doctor/checks/gpu.py +++ b/rapids_cli/doctor/checks/gpu.py @@ -2,28 +2,27 @@ # SPDX-License-Identifier: Apache-2.0 """GPU checks for the doctor command.""" -from __future__ import annotations - +from rapids_cli.hardware import HardwareInfoError from rapids_cli.providers import get_gpu_info REQUIRED_COMPUTE_CAPABILITY = 7 -def gpu_check(verbose=False, **kwargs): +def gpu_check(verbose=False): """Check GPU availability.""" try: num_gpus = get_gpu_info().device_count - except ValueError as e: + except HardwareInfoError as e: raise ValueError("No available GPUs detected") from e assert num_gpus > 0, "No GPUs detected" return f"GPU(s) detected: {num_gpus}" -def check_gpu_compute_capability(verbose=False, **kwargs): +def check_gpu_compute_capability(verbose=False): """Check the system for GPU Compute Capability.""" try: devices = get_gpu_info().devices - except ValueError as e: + except HardwareInfoError as e: raise ValueError("No GPU - cannot determine GPU Compute Capability") from e for dev in devices: diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py index 35e711e..e60a12c 100644 --- a/rapids_cli/doctor/checks/memory.py +++ b/rapids_cli/doctor/checks/memory.py @@ -2,31 +2,30 @@ # SPDX-License-Identifier: Apache-2.0 """Memory checks.""" -from __future__ import annotations - import warnings +from rapids_cli.hardware import HardwareInfoError from rapids_cli.providers import get_gpu_info, get_system_info -def get_system_memory(verbose=False, **kwargs): +def get_system_memory(verbose=False): """Get the total system memory.""" return get_system_info().total_memory_bytes / (1024**3) -def get_gpu_memory(verbose=False, **kwargs): +def get_gpu_memory(verbose=False): """Get the total GPU memory.""" return sum(dev.memory_total_bytes for dev in get_gpu_info().devices) / (1024**3) -def check_memory_to_gpu_ratio(verbose=True, **kwargs): +def check_memory_to_gpu_ratio(verbose=True): """Check the system for a 2:1 ratio of system Memory to total GPU Memory. This is especially useful for Dask. """ try: _ = get_gpu_info().device_count - except ValueError as e: + except HardwareInfoError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e ratio = get_system_memory() / get_gpu_memory() diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index 6bd1b7d..15b5878 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -2,17 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 """Check for NVLink status.""" -from __future__ import annotations - +from rapids_cli.hardware import HardwareInfoError from rapids_cli.providers import get_gpu_info -def check_nvlink_status(verbose=True, **kwargs): +def check_nvlink_status(verbose=True): """Check NVLink status across all GPUs.""" gpu_info = get_gpu_info() try: device_count = gpu_info.device_count - except ValueError as e: + except HardwareInfoError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e # NVLink requires at least 2 GPUs to be meaningful. 
A single GPU has nothing diff --git a/rapids_cli/hardware.py b/rapids_cli/hardware.py index 94aab52..947b986 100644 --- a/rapids_cli/hardware.py +++ b/rapids_cli/hardware.py @@ -18,6 +18,10 @@ class DeviceInfo: nvlink_states: list[bool] = field(default_factory=list) +class HardwareInfoError(Exception): + """Raised when hardware information cannot be obtained.""" + + @runtime_checkable class GpuInfoProvider(Protocol): """Read-only interface for GPU information.""" @@ -81,7 +85,7 @@ def _ensure_loaded(self) -> None: try: pynvml.nvmlInit() except pynvml.NVMLError as e: - raise ValueError("Unable to initialize GPU driver (NVML)") from e + raise HardwareInfoError("Unable to initialize GPU driver (NVML)") from e self._device_count = pynvml.nvmlDeviceGetCount() self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion() @@ -98,7 +102,10 @@ def _ensure_loaded(self) -> None: try: state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) nvlink_states.append(bool(state)) - except pynvml.NVMLError: + except ( + pynvml.NVMLError_InvalidArgument, + pynvml.NVMLError_NotSupported, + ): break self._devices.append( @@ -171,59 +178,3 @@ def cuda_runtime_path(self) -> str | None: ) self._cuda_path_loaded = True return self._cuda_runtime_path - - -@dataclass -class FakeGpuInfo: - """Test fake for GPU information with pre-set data.""" - - device_count: int = 0 - devices: list[DeviceInfo] = field(default_factory=list) - cuda_driver_version: int = 0 - driver_version: str = "" - - -@dataclass -class FakeSystemInfo: - """Test fake for system information with pre-set data.""" - - total_memory_bytes: int = 0 - cuda_runtime_path: str | None = None - - -class FailingGpuInfo: - """Test fake that raises ValueError on any property access.""" - - @property - def device_count(self) -> int: - """Raise ValueError.""" - raise ValueError("No GPU available") - - @property - def devices(self) -> list[DeviceInfo]: - """Raise ValueError.""" - raise ValueError("No GPU available") - - @property - def cuda_driver_version(self) -> int: - """Raise ValueError.""" - raise ValueError("No GPU available") - - @property - def driver_version(self) -> str: - """Raise ValueError.""" - raise ValueError("No GPU available") - - -class FailingSystemInfo: - """Test fake that raises ValueError on any property access.""" - - @property - def total_memory_bytes(self) -> int: - """Raise ValueError.""" - raise ValueError("System info unavailable") - - @property - def cuda_runtime_path(self) -> str | None: - """Raise ValueError.""" - raise ValueError("System info unavailable") diff --git a/rapids_cli/tests/__init__.py b/rapids_cli/tests/__init__.py new file mode 100644 index 0000000..c8dd57a --- /dev/null +++ b/rapids_cli/tests/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/rapids_cli/tests/fakes.py b/rapids_cli/tests/fakes.py new file mode 100644 index 0000000..7818633 --- /dev/null +++ b/rapids_cli/tests/fakes.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Test fakes for hardware providers.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from rapids_cli.hardware import DeviceInfo, HardwareInfoError + + +@dataclass +class FakeGpuInfo: + """Test fake for GPU information with pre-set data.""" + + device_count: int = 0 + devices: list[DeviceInfo] = field(default_factory=list) + cuda_driver_version: int = 0 + driver_version: str = "" + + +@dataclass +class FakeSystemInfo: + """Test fake for system information with pre-set data.""" + + total_memory_bytes: int = 0 + cuda_runtime_path: str | None = None + + +class FailingGpuInfo: + """Test fake that raises HardwareInfoError on any property access.""" + + @property + def device_count(self) -> int: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + @property + def devices(self) -> list[DeviceInfo]: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + @property + def cuda_driver_version(self) -> int: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + @property + def driver_version(self) -> str: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + +class FailingSystemInfo: + """Test fake that raises HardwareInfoError on any property access.""" + + @property + def total_memory_bytes(self) -> int: + """Raise HardwareInfoError.""" + raise HardwareInfoError("System info unavailable") + + @property + def cuda_runtime_path(self) -> str | None: + """Raise HardwareInfoError.""" + raise HardwareInfoError("System info unavailable") diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py index 518c3fb..c123d5b 100644 --- a/rapids_cli/tests/test_cuda.py +++ b/rapids_cli/tests/test_cuda.py @@ -3,7 +3,7 @@ import pytest from rapids_cli.doctor.checks.cuda_driver import cuda_check -from rapids_cli.hardware import FailingGpuInfo, FakeGpuInfo +from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo def test_cuda_check_success(set_gpu_info): diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py index 1da549c..141d0f0 100644 --- a/rapids_cli/tests/test_debug.py +++ b/rapids_cli/tests/test_debug.py @@ -10,7 +10,7 @@ gather_tools, run_debug, ) -from rapids_cli.hardware import FakeGpuInfo, FakeSystemInfo +from rapids_cli.tests.fakes import FakeGpuInfo, FakeSystemInfo def test_gather_cuda_version(set_gpu_info): diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py index b04d4c8..ab81a47 100644 --- a/rapids_cli/tests/test_gpu.py +++ b/rapids_cli/tests/test_gpu.py @@ -7,7 +7,8 @@ check_gpu_compute_capability, gpu_check, ) -from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo +from rapids_cli.hardware import DeviceInfo +from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo def test_gpu_check_success(set_gpu_info): diff --git a/rapids_cli/tests/test_hardware.py b/rapids_cli/tests/test_hardware.py index 1236e0f..215d68c 100644 --- a/rapids_cli/tests/test_hardware.py +++ b/rapids_cli/tests/test_hardware.py @@ -8,13 +8,16 @@ from rapids_cli.hardware import ( DefaultSystemInfo, DeviceInfo, + GpuInfoProvider, + HardwareInfoError, + NvmlGpuInfo, + SystemInfoProvider, +) +from rapids_cli.tests.fakes import ( FailingGpuInfo, FailingSystemInfo, FakeGpuInfo, FakeSystemInfo, - GpuInfoProvider, - NvmlGpuInfo, - SystemInfoProvider, ) # --- NvmlGpuInfo tests --- @@ -26,7 +29,7 @@ def test_nvml_gpu_info_init_failure(): 
side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED), ): gpu_info = NvmlGpuInfo() - with pytest.raises(ValueError, match="Unable to initialize GPU driver"): + with pytest.raises(HardwareInfoError, match="Unable to initialize GPU driver"): _ = gpu_info.device_count @@ -203,22 +206,22 @@ def test_fake_system_info_satisfies_protocol(): def test_failing_gpu_info_device_count(): - with pytest.raises(ValueError, match="No GPU available"): + with pytest.raises(HardwareInfoError, match="No GPU available"): _ = FailingGpuInfo().device_count def test_failing_gpu_info_devices(): - with pytest.raises(ValueError, match="No GPU available"): + with pytest.raises(HardwareInfoError, match="No GPU available"): _ = FailingGpuInfo().devices def test_failing_gpu_info_cuda_driver_version(): - with pytest.raises(ValueError, match="No GPU available"): + with pytest.raises(HardwareInfoError, match="No GPU available"): _ = FailingGpuInfo().cuda_driver_version def test_failing_gpu_info_driver_version(): - with pytest.raises(ValueError, match="No GPU available"): + with pytest.raises(HardwareInfoError, match="No GPU available"): _ = FailingGpuInfo().driver_version @@ -226,10 +229,10 @@ def test_failing_gpu_info_driver_version(): def test_failing_system_info_total_memory(): - with pytest.raises(ValueError, match="System info unavailable"): + with pytest.raises(HardwareInfoError, match="System info unavailable"): _ = FailingSystemInfo().total_memory_bytes def test_failing_system_info_cuda_runtime_path(): - with pytest.raises(ValueError, match="System info unavailable"): + with pytest.raises(HardwareInfoError, match="System info unavailable"): _ = FailingSystemInfo().cuda_runtime_path diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py index 1674da8..f1c73f8 100644 --- a/rapids_cli/tests/test_memory.py +++ b/rapids_cli/tests/test_memory.py @@ -7,7 +7,8 @@ get_gpu_memory, get_system_memory, ) -from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo, FakeSystemInfo +from rapids_cli.hardware import DeviceInfo +from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo, FakeSystemInfo def test_get_system_memory(set_system_info): diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py index f3a35e9..5fd32c5 100644 --- a/rapids_cli/tests/test_nvlink.py +++ b/rapids_cli/tests/test_nvlink.py @@ -3,7 +3,8 @@ import pytest from rapids_cli.doctor.checks.nvlink import check_nvlink_status -from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo +from rapids_cli.hardware import DeviceInfo +from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo def _make_device(index: int, nvlink_states: list[bool]) -> DeviceInfo: From 25ff1d4092af59408dcec1406e082221dd3bf2c0 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Wed, 22 Apr 2026 10:32:04 -0400 Subject: [PATCH 4/5] Reinstate **kwargs on check functions for plugin forward-compatibility External checks (cudf, cuml, etc.) need to absorb future kwargs without breaking when the orchestrator adds new arguments. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Mike McCarty --- rapids_cli/doctor/checks/cuda_driver.py | 2 +- rapids_cli/doctor/checks/gpu.py | 4 ++-- rapids_cli/doctor/checks/memory.py | 6 +++--- rapids_cli/doctor/checks/nvlink.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py index f709811..99ad672 100644 --- a/rapids_cli/doctor/checks/cuda_driver.py +++ b/rapids_cli/doctor/checks/cuda_driver.py @@ -6,7 +6,7 @@ from rapids_cli.providers import get_gpu_info -def cuda_check(verbose=False): +def cuda_check(verbose=False, **kwargs): """Check CUDA availability.""" try: return get_gpu_info().cuda_driver_version diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py index 848a5a9..2b0417b 100644 --- a/rapids_cli/doctor/checks/gpu.py +++ b/rapids_cli/doctor/checks/gpu.py @@ -8,7 +8,7 @@ REQUIRED_COMPUTE_CAPABILITY = 7 -def gpu_check(verbose=False): +def gpu_check(verbose=False, **kwargs): """Check GPU availability.""" try: num_gpus = get_gpu_info().device_count @@ -18,7 +18,7 @@ def gpu_check(verbose=False): return f"GPU(s) detected: {num_gpus}" -def check_gpu_compute_capability(verbose=False): +def check_gpu_compute_capability(verbose=False, **kwargs): """Check the system for GPU Compute Capability.""" try: devices = get_gpu_info().devices diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py index e60a12c..46d1b4d 100644 --- a/rapids_cli/doctor/checks/memory.py +++ b/rapids_cli/doctor/checks/memory.py @@ -8,17 +8,17 @@ from rapids_cli.providers import get_gpu_info, get_system_info -def get_system_memory(verbose=False): +def get_system_memory(verbose=False, **kwargs): """Get the total system memory.""" return get_system_info().total_memory_bytes / (1024**3) -def get_gpu_memory(verbose=False): +def get_gpu_memory(verbose=False, **kwargs): """Get the total GPU memory.""" return sum(dev.memory_total_bytes for dev in get_gpu_info().devices) / (1024**3) -def check_memory_to_gpu_ratio(verbose=True): +def check_memory_to_gpu_ratio(verbose=True, **kwargs): """Check the system for a 2:1 ratio of system Memory to total GPU Memory. This is especially useful for Dask. diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index 15b5878..567b83d 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -6,7 +6,7 @@ from rapids_cli.providers import get_gpu_info -def check_nvlink_status(verbose=True): +def check_nvlink_status(verbose=True, **kwargs): """Check NVLink status across all GPUs.""" gpu_info = get_gpu_info() try: From 326c789b26c424f7b97078d0791ad8cfd2740ecd Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Tue, 28 Apr 2026 16:12:30 -0400 Subject: [PATCH 5/5] Replace provider globals with a dataclass instance Wraps the three module-level provider variables in a _Providers dataclass, eliminating the need for `global` statements while keeping the same set_providers/get_* public API. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Mike McCarty --- rapids_cli/providers.py | 48 ++++++++++++++++++++---------------- rapids_cli/tests/conftest.py | 12 ++++----- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/rapids_cli/providers.py b/rapids_cli/providers.py index 1641fa4..e94d440 100644 --- a/rapids_cli/providers.py +++ b/rapids_cli/providers.py @@ -4,21 +4,31 @@ The doctor orchestrator installs real providers once per run via ``set_providers``; check and debug functions read them via the ``get_*`` -accessors. Tests swap in fakes with ``monkeypatch.setattr`` against the -module-level globals (or via the fixtures in ``rapids_cli/tests/conftest.py``). +accessors. Tests swap in fakes with ``monkeypatch.setattr`` on the +``_providers`` dataclass instance (or via the fixtures in +``rapids_cli/tests/conftest.py``). """ from __future__ import annotations +from dataclasses import dataclass, field from typing import TYPE_CHECKING if TYPE_CHECKING: from rapids_cli.doctor.checks.cuda_toolkit import CudaToolkitInfo from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider -_gpu_info: GpuInfoProvider | None = None -_system_info: SystemInfoProvider | None = None -_toolkit_info: CudaToolkitInfo | None = None + +@dataclass +class _Providers: + """Container for the process-wide hardware providers.""" + + gpu_info: GpuInfoProvider | None = field(default=None) + system_info: SystemInfoProvider | None = field(default=None) + toolkit_info: CudaToolkitInfo | None = field(default=None) + + +_providers = _Providers() def set_providers( @@ -28,40 +38,36 @@ def set_providers( toolkit_info: CudaToolkitInfo | None = None, ) -> None: """Install providers for the current run. Only non-None args are applied.""" - global _gpu_info, _system_info, _toolkit_info if gpu_info is not None: - _gpu_info = gpu_info + _providers.gpu_info = gpu_info if system_info is not None: - _system_info = system_info + _providers.system_info = system_info if toolkit_info is not None: - _toolkit_info = toolkit_info + _providers.toolkit_info = toolkit_info def get_gpu_info() -> GpuInfoProvider: """Return the installed GPU info provider, lazily creating a real one.""" - global _gpu_info - if _gpu_info is None: # pragma: no cover + if _providers.gpu_info is None: # pragma: no cover from rapids_cli.hardware import NvmlGpuInfo - _gpu_info = NvmlGpuInfo() - return _gpu_info + _providers.gpu_info = NvmlGpuInfo() + return _providers.gpu_info def get_system_info() -> SystemInfoProvider: """Return the installed system info provider, lazily creating a real one.""" - global _system_info - if _system_info is None: # pragma: no cover + if _providers.system_info is None: # pragma: no cover from rapids_cli.hardware import DefaultSystemInfo - _system_info = DefaultSystemInfo() - return _system_info + _providers.system_info = DefaultSystemInfo() + return _providers.system_info def get_toolkit_info() -> CudaToolkitInfo: """Return the installed toolkit info, lazily gathering it from the system.""" - global _toolkit_info - if _toolkit_info is None: # pragma: no cover + if _providers.toolkit_info is None: # pragma: no cover from rapids_cli.doctor.checks.cuda_toolkit import _gather_toolkit_info - _toolkit_info = _gather_toolkit_info() - return _toolkit_info + _providers.toolkit_info = _gather_toolkit_info() + return _providers.toolkit_info diff --git a/rapids_cli/tests/conftest.py b/rapids_cli/tests/conftest.py index 71bae86..5ac127b 100644 --- a/rapids_cli/tests/conftest.py +++ b/rapids_cli/tests/conftest.py @@ -17,9 +17,9 @@ 
def _reset_providers(monkeypatch): ``set_system_info`` / ``set_toolkit_info`` fixtures, which install fakes via ``monkeypatch.setattr`` so they auto-revert after the test. """ - monkeypatch.setattr(providers, "_gpu_info", None) - monkeypatch.setattr(providers, "_system_info", None) - monkeypatch.setattr(providers, "_toolkit_info", None) + monkeypatch.setattr(providers._providers, "gpu_info", None) + monkeypatch.setattr(providers._providers, "system_info", None) + monkeypatch.setattr(providers._providers, "toolkit_info", None) @pytest.fixture @@ -27,7 +27,7 @@ def set_gpu_info(monkeypatch): """Install a fake GPU info provider for the duration of the test.""" def _set(fake): - monkeypatch.setattr(providers, "_gpu_info", fake) + monkeypatch.setattr(providers._providers, "gpu_info", fake) return _set @@ -37,7 +37,7 @@ def set_system_info(monkeypatch): """Install a fake system info provider for the duration of the test.""" def _set(fake): - monkeypatch.setattr(providers, "_system_info", fake) + monkeypatch.setattr(providers._providers, "system_info", fake) return _set @@ -47,6 +47,6 @@ def set_toolkit_info(monkeypatch): """Install a fake CUDA toolkit info for the duration of the test.""" def _set(fake): - monkeypatch.setattr(providers, "_toolkit_info", fake) + monkeypatch.setattr(providers._providers, "toolkit_info", fake) return _set
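
For reference, a test consuming these fixtures might look like the sketch
below; `FakeGpuInfo`'s constructor field (`device_count=2`) is an assumption,
not verified against `rapids_cli/tests/fakes.py`, while the return string
matches `gpu_check()` as modified in PATCH 4/5.

```python
from rapids_cli.doctor.checks.gpu import gpu_check
from rapids_cli.tests.fakes import FakeGpuInfo


def test_gpu_check_reports_count(set_gpu_info):
    # Install a fake reporting two devices; the fixture's monkeypatch
    # reverts the provider automatically after the test.
    set_gpu_info(FakeGpuInfo(device_count=2))  # field name assumed
    assert gpu_check() == "GPU(s) detected: 2"
```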