diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py
index fca4d1d..1da1a48 100644
--- a/rapids_cli/debug/debug.py
+++ b/rapids_cli/debug/debug.py
@@ -10,17 +10,17 @@
 from importlib.metadata import distributions, version
 from pathlib import Path

-import cuda.pathfinder
-import pynvml
 from rich.console import Console
 from rich.table import Table

+from rapids_cli.providers import get_gpu_info, get_system_info
+
 console = Console()


 def gather_cuda_version():
     """Return CUDA driver version as a string, similar to nvidia-smi output."""
-    version = pynvml.nvmlSystemGetCudaDriverVersion()
+    version = get_gpu_info().cuda_driver_version
     # pynvml returns an int like 12040 for 12.4, so format as string
     major = version // 1000
     minor = (version % 1000) // 10
@@ -69,16 +69,18 @@ def gather_tools():

 def run_debug(output_format="console"):
     """Run debug."""
-    pynvml.nvmlInit()
+    gpu_info = get_gpu_info()
+    system_info = get_system_info()
+
     debug_info = {
         "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "platform": platform.platform(),
         "nvidia_smi_output": gather_command_output(
             ["nvidia-smi"], "Nvidia-smi not installed"
         ),
-        "driver_version": pynvml.nvmlSystemGetDriverVersion(),
+        "driver_version": gpu_info.driver_version,
         "cuda_version": gather_cuda_version(),
-        "cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"),
+        "cuda_runtime_path": system_info.cuda_runtime_path,
         "system_ctk": sorted(
             [str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()]
         ),
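Note that `cuda_driver_version` deliberately preserves pynvml's packed-integer encoding, so `gather_cuda_version` keeps working unchanged. A standalone sketch of the arithmetic — `decode` is a hypothetical name, and the patch-digit handling sits below the hunk shown above; the behavior is pinned down by the tests in test_debug.py later in this diff:

```python
# Hypothetical standalone mirror of gather_cuda_version's arithmetic.
# test_debug.py expects 12040 -> "12.4" and 12345 -> "12.34.5".
def decode(version: int) -> str:
    major = version // 1000          # 12040 // 1000 == 12
    minor = (version % 1000) // 10   # (12040 % 1000) // 10 == 4
    patch = version % 10             # assumed: only shown when non-zero
    return f"{major}.{minor}.{patch}" if patch else f"{major}.{minor}"

assert decode(12040) == "12.4"
assert decode(12345) == "12.34.5"
```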
diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py
index 252dd47..99ad672 100644
--- a/rapids_cli/doctor/checks/cuda_driver.py
+++ b/rapids_cli/doctor/checks/cuda_driver.py
@@ -2,17 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0
 """Check for CUDA and driver compatibility."""

-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info


-def cuda_check(verbose=False):
+def cuda_check(verbose=False, **kwargs):
     """Check CUDA availability."""
     try:
-        pynvml.nvmlInit()
-        try:
-            cuda_version = pynvml.nvmlSystemGetCudaDriverVersion()
-            return cuda_version
-        except pynvml.NVMLError as e:
-            raise ValueError("Unable to look up CUDA version") from e
-    except pynvml.NVMLError as e:
+        return get_gpu_info().cuda_driver_version
+    except HardwareInfoError as e:
         raise ValueError("Unable to look up CUDA version") from e
diff --git a/rapids_cli/doctor/checks/cuda_toolkit.py b/rapids_cli/doctor/checks/cuda_toolkit.py
index 033bc52..d2a3df8 100644
--- a/rapids_cli/doctor/checks/cuda_toolkit.py
+++ b/rapids_cli/doctor/checks/cuda_toolkit.py
@@ -186,12 +186,11 @@ def _gather_toolkit_info() -> CudaToolkitInfo:  # pragma: no cover
     return info


-def cuda_toolkit_check(
-    verbose=False, *, toolkit_info: CudaToolkitInfo | None = None, **kwargs
-):
+def cuda_toolkit_check(verbose=False, **kwargs):
     """Check CUDA toolkit library availability and version consistency."""
-    if toolkit_info is None:  # pragma: no cover
-        toolkit_info = _gather_toolkit_info()
+    from rapids_cli.providers import get_toolkit_info
+
+    toolkit_info = get_toolkit_info()

     # Check library findability
     if toolkit_info.missing_libs:
diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py
index 77e6ca6..2b0417b 100644
--- a/rapids_cli/doctor/checks/gpu.py
+++ b/rapids_cli/doctor/checks/gpu.py
@@ -2,38 +2,35 @@
 # SPDX-License-Identifier: Apache-2.0
 """GPU checks for the doctor command."""

-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info

 REQUIRED_COMPUTE_CAPABILITY = 7


-def gpu_check(verbose=False):
+def gpu_check(verbose=False, **kwargs):
     """Check GPU availability."""
     try:
-        pynvml.nvmlInit()
-        num_gpus = pynvml.nvmlDeviceGetCount()
-    except pynvml.NVMLError as e:
+        num_gpus = get_gpu_info().device_count
+    except HardwareInfoError as e:
         raise ValueError("No available GPUs detected") from e

     assert num_gpus > 0, "No GPUs detected"
     return f"GPU(s) detected: {num_gpus}"


-def check_gpu_compute_capability(verbose):
+def check_gpu_compute_capability(verbose=False, **kwargs):
     """Check the system for GPU Compute Capability."""
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        devices = get_gpu_info().devices
+    except HardwareInfoError as e:
         raise ValueError("No GPU - cannot determine GPU Compute Capability") from e

-    for i in range(pynvml.nvmlDeviceGetCount()):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
-        if major >= REQUIRED_COMPUTE_CAPABILITY:
+    for dev in devices:
+        if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY:
             continue
-        else:
-            raise ValueError(
-                f"GPU {i} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} "
-                f"or higher but only has {major}.{minor}."
-                "See https://developer.nvidia.com/cuda-gpus for more information."
-            )
+        raise ValueError(
+            f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} "
+            f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}. "
+            "See https://developer.nvidia.com/cuda-gpus for more information."
+        )

     return True
diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py
index cb1fcb5..46d1b4d 100644
--- a/rapids_cli/doctor/checks/memory.py
+++ b/rapids_cli/doctor/checks/memory.py
@@ -4,45 +4,31 @@

 import warnings

-import psutil
-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info, get_system_info


-def get_system_memory(verbose=False):
+def get_system_memory(verbose=False, **kwargs):
     """Get the total system memory."""
-    virtual_memory = psutil.virtual_memory()
-    total_memory = virtual_memory.total / (1024**3)  # converts bytes to gigabytes
-    return total_memory
+    return get_system_info().total_memory_bytes / (1024**3)


-def get_gpu_memory(verbose=False):
+def get_gpu_memory(verbose=False, **kwargs):
     """Get the total GPU memory."""
-    pynvml.nvmlInit()
-    gpus = pynvml.nvmlDeviceGetCount()
-    gpu_memory_total = 0
-    for i in range(gpus):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        gpu_memory_total += memory_info.total / (1024**3)  # converts to gigabytes
+    return sum(dev.memory_total_bytes for dev in get_gpu_info().devices) / (1024**3)

-    pynvml.nvmlShutdown()
-    return gpu_memory_total
-

-def check_memory_to_gpu_ratio(verbose=True):
+def check_memory_to_gpu_ratio(verbose=True, **kwargs):
     """Check the system for a 2:1 ratio of system Memory to total GPU Memory.

     This is especially useful for Dask.
-
     """
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        _ = get_gpu_info().device_count
+    except HardwareInfoError as e:
         raise ValueError("GPU not found. Please ensure GPUs are installed.") from e

-    system_memory = get_system_memory(verbose)
-    gpu_memory = get_gpu_memory(verbose)
-    ratio = system_memory / gpu_memory
+    ratio = get_system_memory() / get_gpu_memory()

     if ratio < 1.8:
         warnings.warn(
             "System Memory to total GPU Memory ratio not at least 2:1 ratio. "
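For context on the threshold: the docstring asks for 2:1, but the code tolerates anything at or above 1.8, and a shortfall warns rather than fails. A quick worked example (plain arithmetic, not part of the diff):

```python
# 64 GB of system RAM against 32 GB of total GPU memory: ratio 2.0, no warning.
assert 64.0 / 32.0 >= 1.8
# 48 GB against the same 32 GB: ratio 1.5, a UserWarning is emitted but the
# check still returns True (see test_check_memory_to_gpu_ratio_warning below).
assert 48.0 / 32.0 < 1.8
```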
" diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index 6dd6c66..567b83d 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -2,18 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 """Check for NVLink status.""" -import pynvml +from rapids_cli.hardware import HardwareInfoError +from rapids_cli.providers import get_gpu_info def check_nvlink_status(verbose=True, **kwargs): """Check NVLink status across all GPUs.""" + gpu_info = get_gpu_info() try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + device_count = gpu_info.device_count + except HardwareInfoError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e - device_count = pynvml.nvmlDeviceGetCount() - # NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing # to link to, so there is nothing to check. if device_count < 2: @@ -23,29 +23,20 @@ def check_nvlink_status(verbose=True, **kwargs): # model). Mixed configurations — e.g. some NVLink-capable GPUs alongside some # that are not — are not handled and may produce misleading results. - failed_links: list[tuple[int, int]] = [] - - for gpu_idx in range(device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) - # NVML provides no API to query the number of NVLink slots on a device - # (e.g. V100=6, A100=12, H100=18). The only way to discover the real count - # is to iterate up to NVML_NVLINK_MAX_LINKS and stop when the driver signals - # that link_id is out of range via NVMLError_InvalidArgument. - for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): - try: - # nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED - # if the link is active, or NVML_FEATURE_DISABLED if it is not. - state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) - if state == pynvml.NVML_FEATURE_DISABLED: - failed_links.append((gpu_idx, link_id)) - except pynvml.NVMLError_NotSupported: - # The driver reports NVLink is not supported on this system. - # There is nothing to check — skip like the single-GPU case above. - return False - except pynvml.NVMLError_InvalidArgument: - # link_id exceeds the number of NVLink slots on this device. - # Stop iterating links for this GPU. - break + devices = gpu_info.devices + + # An empty nvlink_states means the driver reported NVLink as unsupported (or + # no links were enumerated) for that device. Treat a system where no device + # advertises links the same as the single-GPU case — nothing to check. 
+ if all(not dev.nvlink_states for dev in devices): + return False + + failed_links: list[tuple[int, int]] = [ + (dev.index, link_id) + for dev in devices + for link_id, active in enumerate(dev.nvlink_states) + if not active + ] if failed_links: details = ", ".join(f"GPU {gpu} link {link}" for gpu, link in failed_links) diff --git a/rapids_cli/doctor/doctor.py b/rapids_cli/doctor/doctor.py index e7cd0ad..ed9ea43 100644 --- a/rapids_cli/doctor/doctor.py +++ b/rapids_cli/doctor/doctor.py @@ -8,8 +8,10 @@ from rich.console import Console +from rapids_cli import providers from rapids_cli._compatibility import entry_points from rapids_cli.constants import DOCTOR_SYMBOL +from rapids_cli.hardware import DefaultSystemInfo, NvmlGpuInfo console = Console() @@ -76,6 +78,8 @@ def doctor_check( console.print("Dry run, skipping checks") return True + providers.set_providers(gpu_info=NvmlGpuInfo(), system_info=DefaultSystemInfo()) + results: list[CheckResult] = [] with console.status("[bold green]Running checks...") as ui_status: for i, check_fn in enumerate(checks): diff --git a/rapids_cli/hardware.py b/rapids_cli/hardware.py new file mode 100644 index 0000000..947b986 --- /dev/null +++ b/rapids_cli/hardware.py @@ -0,0 +1,180 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Hardware abstraction layer for GPU and system information.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@dataclass +class DeviceInfo: + """Per-GPU device information.""" + + index: int + compute_capability: tuple[int, int] + memory_total_bytes: int + nvlink_states: list[bool] = field(default_factory=list) + + +class HardwareInfoError(Exception): + """Raised when hardware information cannot be obtained.""" + + +@runtime_checkable +class GpuInfoProvider(Protocol): + """Read-only interface for GPU information.""" + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + ... + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + ... + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer.""" + ... + + @property + def driver_version(self) -> str: + """Return driver version string.""" + ... + + +@runtime_checkable +class SystemInfoProvider(Protocol): + """Read-only interface for system information.""" + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + ... + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + ... + + +class NvmlGpuInfo: + """Real GPU info provider backed by pynvml. + + Lazily loads all device information on first property access and caches results. 
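Taken together, the orchestration is now a two-step wiring: `doctor_check` installs the real providers once, and every check that runs afterwards resolves hardware facts through the registry instead of initializing NVML itself. A minimal sketch of that flow (assumes a machine where NVML loads; the printed count is illustrative):

```python
from rapids_cli import providers
from rapids_cli.doctor.checks.gpu import gpu_check
from rapids_cli.hardware import DefaultSystemInfo, NvmlGpuInfo

# One-time installation, mirroring what doctor_check does before its check loop.
providers.set_providers(gpu_info=NvmlGpuInfo(), system_info=DefaultSystemInfo())

# Checks read the registry; pynvml is only touched lazily, inside NvmlGpuInfo.
print(gpu_check())  # e.g. "GPU(s) detected: 2"
```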
+ """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._loaded = False + self._device_count = 0 + self._devices: list[DeviceInfo] = [] + self._cuda_driver_version = 0 + self._driver_version = "" + + def _ensure_loaded(self) -> None: + if self._loaded: + return + + import pynvml + + try: + pynvml.nvmlInit() + except pynvml.NVMLError as e: + raise HardwareInfoError("Unable to initialize GPU driver (NVML)") from e + + self._device_count = pynvml.nvmlDeviceGetCount() + self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion() + self._driver_version = pynvml.nvmlSystemGetDriverVersion() + + self._devices = [] + for i in range(self._device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + + nvlink_states: list[bool] = [] + for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): + try: + state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) + nvlink_states.append(bool(state)) + except ( + pynvml.NVMLError_InvalidArgument, + pynvml.NVMLError_NotSupported, + ): + break + + self._devices.append( + DeviceInfo( + index=i, + compute_capability=(major, minor), + memory_total_bytes=memory_info.total, + nvlink_states=nvlink_states, + ) + ) + + self._loaded = True + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + self._ensure_loaded() + return self._device_count + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + self._ensure_loaded() + return self._devices + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer (e.g. 12040).""" + self._ensure_loaded() + return self._cuda_driver_version + + @property + def driver_version(self) -> str: + """Return driver version string.""" + self._ensure_loaded() + return self._driver_version + + +class DefaultSystemInfo: + """Real system info provider backed by psutil and cuda.pathfinder. + + Lazily loads each piece of information on first access. + """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._memory_loaded = False + self._total_memory_bytes = 0 + self._cuda_path_loaded = False + self._cuda_runtime_path: str | None = None + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + if not self._memory_loaded: + import psutil + + self._total_memory_bytes = psutil.virtual_memory().total + self._memory_loaded = True + return self._total_memory_bytes + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + if not self._cuda_path_loaded: + import cuda.pathfinder + + self._cuda_runtime_path = cuda.pathfinder.find_nvidia_header_directory( + "cudart" + ) + self._cuda_path_loaded = True + return self._cuda_runtime_path diff --git a/rapids_cli/providers.py b/rapids_cli/providers.py new file mode 100644 index 0000000..e94d440 --- /dev/null +++ b/rapids_cli/providers.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Process-wide hardware provider registry. + +The doctor orchestrator installs real providers once per run via +``set_providers``; check and debug functions read them via the ``get_*`` +accessors. 
diff --git a/rapids_cli/providers.py b/rapids_cli/providers.py
new file mode 100644
index 0000000..e94d440
--- /dev/null
+++ b/rapids_cli/providers.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Process-wide hardware provider registry.
+
+The doctor orchestrator installs real providers once per run via
+``set_providers``; check and debug functions read them via the ``get_*``
+accessors. Tests swap in fakes with ``monkeypatch.setattr`` on the
+``_providers`` dataclass instance (or via the fixtures in
+``rapids_cli/tests/conftest.py``).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from rapids_cli.doctor.checks.cuda_toolkit import CudaToolkitInfo
+    from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider
+
+
+@dataclass
+class _Providers:
+    """Container for the process-wide hardware providers."""
+
+    gpu_info: GpuInfoProvider | None = field(default=None)
+    system_info: SystemInfoProvider | None = field(default=None)
+    toolkit_info: CudaToolkitInfo | None = field(default=None)
+
+
+_providers = _Providers()
+
+
+def set_providers(
+    *,
+    gpu_info: GpuInfoProvider | None = None,
+    system_info: SystemInfoProvider | None = None,
+    toolkit_info: CudaToolkitInfo | None = None,
+) -> None:
+    """Install providers for the current run. Only non-None args are applied."""
+    if gpu_info is not None:
+        _providers.gpu_info = gpu_info
+    if system_info is not None:
+        _providers.system_info = system_info
+    if toolkit_info is not None:
+        _providers.toolkit_info = toolkit_info
+
+
+def get_gpu_info() -> GpuInfoProvider:
+    """Return the installed GPU info provider, lazily creating a real one."""
+    if _providers.gpu_info is None:  # pragma: no cover
+        from rapids_cli.hardware import NvmlGpuInfo
+
+        _providers.gpu_info = NvmlGpuInfo()
+    return _providers.gpu_info
+
+
+def get_system_info() -> SystemInfoProvider:
+    """Return the installed system info provider, lazily creating a real one."""
+    if _providers.system_info is None:  # pragma: no cover
+        from rapids_cli.hardware import DefaultSystemInfo
+
+        _providers.system_info = DefaultSystemInfo()
+    return _providers.system_info
+
+
+def get_toolkit_info() -> CudaToolkitInfo:
+    """Return the installed toolkit info, lazily gathering it from the system."""
+    if _providers.toolkit_info is None:  # pragma: no cover
+        from rapids_cli.doctor.checks.cuda_toolkit import _gather_toolkit_info
+
+        _providers.toolkit_info = _gather_toolkit_info()
+    return _providers.toolkit_info
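The registry is intentionally simple; a sketch of its behavior, using the test fake defined later in this diff:

```python
from rapids_cli import providers
from rapids_cli.tests.fakes import FakeGpuInfo

providers.set_providers(gpu_info=FakeGpuInfo(device_count=2))
assert providers.get_gpu_info().device_count == 2

# Arguments left at None are ignored, so each slot can be installed
# independently without clobbering the others.
providers.set_providers(system_info=None)  # no-op
assert providers.get_gpu_info().device_count == 2
```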
+ """ + monkeypatch.setattr(providers._providers, "gpu_info", None) + monkeypatch.setattr(providers._providers, "system_info", None) + monkeypatch.setattr(providers._providers, "toolkit_info", None) + + +@pytest.fixture +def set_gpu_info(monkeypatch): + """Install a fake GPU info provider for the duration of the test.""" + + def _set(fake): + monkeypatch.setattr(providers._providers, "gpu_info", fake) + + return _set + + +@pytest.fixture +def set_system_info(monkeypatch): + """Install a fake system info provider for the duration of the test.""" + + def _set(fake): + monkeypatch.setattr(providers._providers, "system_info", fake) + + return _set + + +@pytest.fixture +def set_toolkit_info(monkeypatch): + """Install a fake CUDA toolkit info for the duration of the test.""" + + def _set(fake): + monkeypatch.setattr(providers._providers, "toolkit_info", fake) + + return _set diff --git a/rapids_cli/tests/fakes.py b/rapids_cli/tests/fakes.py new file mode 100644 index 0000000..7818633 --- /dev/null +++ b/rapids_cli/tests/fakes.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Test fakes for hardware providers.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from rapids_cli.hardware import DeviceInfo, HardwareInfoError + + +@dataclass +class FakeGpuInfo: + """Test fake for GPU information with pre-set data.""" + + device_count: int = 0 + devices: list[DeviceInfo] = field(default_factory=list) + cuda_driver_version: int = 0 + driver_version: str = "" + + +@dataclass +class FakeSystemInfo: + """Test fake for system information with pre-set data.""" + + total_memory_bytes: int = 0 + cuda_runtime_path: str | None = None + + +class FailingGpuInfo: + """Test fake that raises HardwareInfoError on any property access.""" + + @property + def device_count(self) -> int: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + @property + def devices(self) -> list[DeviceInfo]: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + @property + def cuda_driver_version(self) -> int: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + @property + def driver_version(self) -> str: + """Raise HardwareInfoError.""" + raise HardwareInfoError("No GPU available") + + +class FailingSystemInfo: + """Test fake that raises HardwareInfoError on any property access.""" + + @property + def total_memory_bytes(self) -> int: + """Raise HardwareInfoError.""" + raise HardwareInfoError("System info unavailable") + + @property + def cuda_runtime_path(self) -> str | None: + """Raise HardwareInfoError.""" + raise HardwareInfoError("System info unavailable") diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py index c6d4525..c123d5b 100644 --- a/rapids_cli/tests/test_cuda.py +++ b/rapids_cli/tests/test_cuda.py @@ -1,34 +1,17 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py
index c6d4525..c123d5b 100644
--- a/rapids_cli/tests/test_cuda.py
+++ b/rapids_cli/tests/test_cuda.py
@@ -1,34 +1,17 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-from unittest.mock import patch
-
-import pynvml
 import pytest

 from rapids_cli.doctor.checks.cuda_driver import cuda_check
+from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo


-def test_cuda_check_success():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
-    ):
-        assert cuda_check(verbose=True) == 12050
-
-
-def test_cuda_check_init_fails():
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
-        with pytest.raises(ValueError, match="Unable to look up CUDA version"):
-            cuda_check()
+def test_cuda_check_success(set_gpu_info):
+    set_gpu_info(FakeGpuInfo(cuda_driver_version=12050))
+    assert cuda_check(verbose=True) == 12050


-def test_cuda_check_version_query_fails():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch(
-            "pynvml.nvmlSystemGetCudaDriverVersion",
-            side_effect=pynvml.NVMLError(1),
-        ),
-    ):
-        with pytest.raises(ValueError, match="Unable to look up CUDA version"):
-            cuda_check()
+def test_cuda_check_no_gpu(set_gpu_info):
+    set_gpu_info(FailingGpuInfo())
+    with pytest.raises(ValueError, match="Unable to look up CUDA version"):
+        cuda_check(verbose=False)
diff --git a/rapids_cli/tests/test_cuda_toolkit.py b/rapids_cli/tests/test_cuda_toolkit.py
index 8d1a19a..7487bbf 100644
--- a/rapids_cli/tests/test_cuda_toolkit.py
+++ b/rapids_cli/tests/test_cuda_toolkit.py
@@ -71,9 +71,9 @@ def test_ctypes_cuda_version_oserror():


 # Check function tests
-def test_check_success():
-    info = _make_info()
-    result = cuda_toolkit_check(verbose=True, toolkit_info=info)
+def test_check_success(set_toolkit_info):
+    set_toolkit_info(_make_info())
+    result = cuda_toolkit_check(verbose=True)
     assert isinstance(result, str)
     assert "CUDA 12" in result

@@ -90,75 +90,83 @@
     ],
     ids=["all_missing", "partial_missing"],
 )
-def test_check_missing_libs(found_libs, missing_libs, expected_match):
-    info = _make_info(
-        found_libs=found_libs,
-        missing_libs=missing_libs,
-        cudart_path=None if not found_libs else "/usr/lib/libcudart.so",
-        toolkit_major=None if not found_libs else 12,
+def test_check_missing_libs(set_toolkit_info, found_libs, missing_libs, expected_match):
+    set_toolkit_info(
+        _make_info(
+            found_libs=found_libs,
+            missing_libs=missing_libs,
+            cudart_path=None if not found_libs else "/usr/lib/libcudart.so",
+            toolkit_major=None if not found_libs else 12,
+        )
     )
     with pytest.raises(ValueError, match=expected_match):
-        cuda_toolkit_check(toolkit_info=info)
+        cuda_toolkit_check()


-def test_check_driver_query_fails():
-    info = _make_info(driver_major=None)
+def test_check_driver_query_fails(set_toolkit_info):
+    set_toolkit_info(_make_info(driver_major=None))
     with pytest.raises(ValueError, match="Unable to query"):
-        cuda_toolkit_check(toolkit_info=info)
+        cuda_toolkit_check()


-def test_check_toolkit_newer_than_driver():
+def test_check_toolkit_newer_than_driver(set_toolkit_info):
     """CUDA 13 toolkit + CUDA 12 driver = error."""
-    info = _make_info(
-        found_libs={"cudart": "conda", "nvrtc": "conda", "nvvm": "conda"},
-        cudart_path="/usr/lib/libcudart.so.13",
-        toolkit_major=13,
-        driver_major=12,
+    set_toolkit_info(
+        _make_info(
+            found_libs={"cudart": "conda", "nvrtc": "conda", "nvvm": "conda"},
+            cudart_path="/usr/lib/libcudart.so.13",
+            toolkit_major=13,
+            driver_major=12,
+        )
     )
     with pytest.raises(ValueError, match="newer than what the GPU driver supports"):
-        cuda_toolkit_check(toolkit_info=info)
+        cuda_toolkit_check()


-def test_check_toolkit_older_than_driver_passes():
+def test_check_toolkit_older_than_driver_passes(set_toolkit_info):
     """CUDA 12 toolkit + CUDA 13 driver = fine (backward compatible)."""
-    info = _make_info(toolkit_major=12, driver_major=13)
-    assert cuda_toolkit_check(verbose=False, toolkit_info=info) is True
+    set_toolkit_info(_make_info(toolkit_major=12, driver_major=13))
+    assert cuda_toolkit_check(verbose=False) is True


-def test_check_cuda_symlink_newer_than_driver(tmp_path):
+def test_check_cuda_symlink_newer_than_driver(set_toolkit_info, tmp_path):
     """Only checked when CUDA was found via system paths, not conda/pip."""
     symlink_target = tmp_path / "cuda-13.0"
     symlink_target.mkdir()
     symlink_path = tmp_path / "cuda"
     symlink_path.symlink_to(symlink_target)
-    info = _make_info(
-        found_libs={
-            "cudart": "system-search",
-            "nvrtc": "system-search",
-            "nvvm": "system-search",
-        },
-        toolkit_major=12,
-        driver_major=12,
+    set_toolkit_info(
+        _make_info(
+            found_libs={
+                "cudart": "system-search",
+                "nvrtc": "system-search",
+                "nvvm": "system-search",
+            },
+            toolkit_major=12,
+            driver_major=12,
+        )
     )
     with (
         patch("rapids_cli.doctor.checks.cuda_toolkit._CUDA_SYMLINK", symlink_path),
         patch.dict("os.environ", {}, clear=True),
     ):
         with pytest.raises(ValueError, match="points to CUDA 13"):
-            cuda_toolkit_check(toolkit_info=info)
+            cuda_toolkit_check()


-def test_check_cuda_home_newer_than_driver():
+def test_check_cuda_home_newer_than_driver(set_toolkit_info):
     """Only checked when CUDA was found via system paths, not conda/pip."""
-    info = _make_info(
-        found_libs={
-            "cudart": "system-search",
-            "nvrtc": "system-search",
-            "nvvm": "system-search",
-        },
-        toolkit_major=12,
-        driver_major=12,
+    set_toolkit_info(
+        _make_info(
+            found_libs={
+                "cudart": "system-search",
+                "nvrtc": "system-search",
+                "nvvm": "system-search",
+            },
+            toolkit_major=12,
+            driver_major=12,
+        )
     )
     with (
         patch(
@@ -167,4 +175,4 @@ def test_check_cuda_home_newer_than_driver():
         patch.dict("os.environ", {"CUDA_HOME": "/usr/local/cuda-13.0"}, clear=True),
     ):
         with pytest.raises(ValueError, match="CUDA_HOME"):
-            cuda_toolkit_check(toolkit_info=info)
+            cuda_toolkit_check()
diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py
index 91c330c..141d0f0 100644
--- a/rapids_cli/tests/test_debug.py
+++ b/rapids_cli/tests/test_debug.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 import json
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch

 from rapids_cli.debug.debug import (
     gather_command_output,
@@ -10,24 +10,20 @@
     gather_tools,
     run_debug,
 )
+from rapids_cli.tests.fakes import FakeGpuInfo, FakeSystemInfo


-def test_gather_cuda_version():
-    """Test CUDA version gathering."""
-    with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040):
-        result = gather_cuda_version()
-        assert result == "12.4"
+def test_gather_cuda_version(set_gpu_info):
+    set_gpu_info(FakeGpuInfo(cuda_driver_version=12040))
+    assert gather_cuda_version() == "12.4"


-def test_gather_cuda_version_with_patch():
-    """Test CUDA version with patch number."""
-    with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12345):
-        result = gather_cuda_version()
-        assert result == "12.34.5"
+def test_gather_cuda_version_with_patch(set_gpu_info):
+    set_gpu_info(FakeGpuInfo(cuda_driver_version=12345))
+    assert gather_cuda_version() == "12.34.5"


 def test_gather_package_versions():
-    """Test package version gathering."""
     result = gather_package_versions()
     assert isinstance(result, dict)
     assert len(result) > 0
@@ -36,25 +32,21 @@


 def test_gather_command_output_success():
-    """Test successful command output gathering."""
     result = gather_command_output(["echo", "test"])
     assert result == "test"


 def test_gather_command_output_with_fallback():
-    """Test command output with fallback."""
     result = gather_command_output(["nonexistent_command"], fallback_output="fallback")
     assert result == "fallback"


 def test_gather_command_output_no_fallback():
-    """Test command output without fallback."""
     result = gather_command_output(["nonexistent_command"])
     assert result is None


 def test_gather_tools():
-    """Test tools gathering."""
     with (
         patch(
             "rapids_cli.debug.debug.gather_command_output",
@@ -68,19 +60,22 @@
     assert "g++" in result


-def test_run_debug_console(capsys):
-    """Test run_debug with console output."""
-    mock_vm = MagicMock()
-    mock_vm.total = 32 * 1024**3
+def test_run_debug_console(capsys, set_gpu_info, set_system_info):
+    set_gpu_info(
+        FakeGpuInfo(
+            device_count=1,
+            cuda_driver_version=12040,
+            driver_version="550.54.15",
+        )
+    )
+    set_system_info(
+        FakeSystemInfo(
+            total_memory_bytes=32 * 1024**3,
+            cuda_runtime_path="/usr/local/cuda/include",
+        )
+    )
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"),
-        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
-        patch(
-            "cuda.pathfinder.find_nvidia_header_directory",
-            return_value="/usr/local/cuda/include",
-        ),
         patch("pathlib.Path.glob", return_value=[]),
         patch("rapids_cli.debug.debug.gather_package_versions", return_value={}),
         patch("rapids_cli.debug.debug.gather_command_output", return_value=None),
@@ -93,16 +88,22 @@
     assert "RAPIDS Debug Information" in captured.out


-def test_run_debug_json(capsys):
-    """Test run_debug with JSON output."""
+def test_run_debug_json(capsys, set_gpu_info, set_system_info):
+    set_gpu_info(
+        FakeGpuInfo(
+            device_count=1,
+            cuda_driver_version=12040,
+            driver_version="550.54.15",
+        )
+    )
+    set_system_info(
+        FakeSystemInfo(
+            total_memory_bytes=32 * 1024**3,
+            cuda_runtime_path="/usr/local/cuda/include",
+        )
+    )
+
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"),
-        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
-        patch(
-            "cuda.pathfinder.find_nvidia_header_directory",
-            return_value="/usr/local/cuda/include",
-        ),
         patch("pathlib.Path.glob", return_value=[]),
         patch(
             "rapids_cli.debug.debug.gather_package_versions",
diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py
index a895bc2..ab81a47 100644
--- a/rapids_cli/tests/test_gpu.py
+++ b/rapids_cli/tests/test_gpu.py
@@ -1,7 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-from unittest.mock import patch
-
 import pytest

 from rapids_cli.doctor.checks.gpu import (
@@ -9,67 +7,59 @@
     check_gpu_compute_capability,
     gpu_check,
 )
+from rapids_cli.hardware import DeviceInfo
+from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo


-def test_gpu_check_success():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-    ):
-        result = gpu_check(verbose=True)
-        assert result == "GPU(s) detected: 2"
-
+def test_gpu_check_success(set_gpu_info):
+    set_gpu_info(FakeGpuInfo(device_count=2))
+    assert gpu_check(verbose=True) == "GPU(s) detected: 2"

-def test_gpu_check_no_gpus():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=0),
-    ):
-        with pytest.raises(AssertionError, match="No GPUs detected"):
-            gpu_check(verbose=False)

+def test_gpu_check_no_gpus(set_gpu_info):
+    set_gpu_info(FakeGpuInfo(device_count=0))
+    with pytest.raises(AssertionError, match="No GPUs detected"):
+        gpu_check(verbose=False)

-def test_gpu_check_nvml_error():
-    import pynvml

-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
-        with pytest.raises(ValueError, match="No available GPUs detected"):
-            gpu_check(verbose=False)
+def test_gpu_check_nvml_error(set_gpu_info):
+    set_gpu_info(FailingGpuInfo())
+    with pytest.raises(ValueError, match="No available GPUs detected"):
+        gpu_check(verbose=False)


-def test_check_gpu_compute_capability_success():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex"),
-        patch(
-            "pynvml.nvmlDeviceGetCudaComputeCapability",
-            return_value=(REQUIRED_COMPUTE_CAPABILITY, 5),
+def test_check_gpu_compute_capability_success(set_gpu_info):
+    devices = [
+        DeviceInfo(
+            index=0,
+            compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5),
+            memory_total_bytes=0,
         ),
+        DeviceInfo(
+            index=1,
+            compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5),
+            memory_total_bytes=0,
+        ),
+    ]
+    set_gpu_info(FakeGpuInfo(device_count=2, devices=devices))
+    assert check_gpu_compute_capability(verbose=True) is True
+
+
+def test_check_gpu_compute_capability_insufficient(set_gpu_info):
+    devices = [
+        DeviceInfo(index=0, compute_capability=(6, 0), memory_total_bytes=0),
+    ]
+    set_gpu_info(FakeGpuInfo(device_count=1, devices=devices))
+    with pytest.raises(
+        ValueError,
+        match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}",
     ):
-        result = check_gpu_compute_capability(verbose=True)
-        assert result is True
+        check_gpu_compute_capability(verbose=False)


-def test_check_gpu_compute_capability_insufficient():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=1),
-        patch("pynvml.nvmlDeviceGetHandleByIndex"),
-        patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(6, 0)),
+def test_check_gpu_compute_capability_no_gpu(set_gpu_info):
+    set_gpu_info(FailingGpuInfo())
+    with pytest.raises(
+        ValueError, match="No GPU - cannot determine GPU Compute Capability"
     ):
-        with pytest.raises(
-            ValueError,
-            match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}",
-        ):
-            check_gpu_compute_capability(verbose=False)
-
-
-def test_check_gpu_compute_capability_no_gpu():
-    import pynvml
-
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
-        with pytest.raises(
-            ValueError, match="No GPU - cannot determine GPU Compute Capability"
-        ):
-            check_gpu_compute_capability(verbose=False)
+        check_gpu_compute_capability(verbose=False)
diff --git a/rapids_cli/tests/test_hardware.py b/rapids_cli/tests/test_hardware.py
new file mode 100644
index 0000000..215d68c
--- /dev/null
+++ b/rapids_cli/tests/test_hardware.py
@@ -0,0 +1,238 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from unittest.mock import MagicMock, patch
+
+import pynvml
+import pytest
+
+from rapids_cli.hardware import (
+    DefaultSystemInfo,
+    DeviceInfo,
+    GpuInfoProvider,
+    HardwareInfoError,
+    NvmlGpuInfo,
+    SystemInfoProvider,
+)
+from rapids_cli.tests.fakes import (
+    FailingGpuInfo,
+    FailingSystemInfo,
+    FakeGpuInfo,
+    FakeSystemInfo,
+)
+
+# --- NvmlGpuInfo tests ---
+
+
+def test_nvml_gpu_info_init_failure():
+    with patch(
+        "pynvml.nvmlInit",
+        side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED),
+    ):
+        gpu_info = NvmlGpuInfo()
+        with pytest.raises(HardwareInfoError, match="Unable to initialize GPU driver"):
+            _ = gpu_info.device_count
+
+
+def test_nvml_gpu_info_loads_once():
+    mock_handle = MagicMock()
+    mock_memory = MagicMock()
+    mock_memory.total = 16 * 1024**3
+
+    with (
+        patch("pynvml.nvmlInit") as mock_init,
+        patch("pynvml.nvmlDeviceGetCount", return_value=1),
+        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
+        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"),
+        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)),
+        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
+        patch(
+            "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
+        ),
+    ):
+        gpu_info = NvmlGpuInfo()
+        # Access multiple properties to verify caching
+        _ = gpu_info.device_count
+        _ = gpu_info.devices
+        _ = gpu_info.cuda_driver_version
+        _ = gpu_info.driver_version
+        # nvmlInit should be called exactly once
+        mock_init.assert_called_once()
+
+
+def test_nvml_gpu_info_device_data():
+    mock_handle = MagicMock()
+    mock_memory = MagicMock()
+    mock_memory.total = 24 * 1024**3
+
+    with (
+        patch("pynvml.nvmlInit"),
+        patch("pynvml.nvmlDeviceGetCount", return_value=2),
+        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12060),
+        patch("pynvml.nvmlSystemGetDriverVersion", return_value="560.10"),
+        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(9, 0)),
+        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
+        patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1),
+    ):
+        gpu_info = NvmlGpuInfo()
+        assert gpu_info.device_count == 2
+        assert len(gpu_info.devices) == 2
+        assert gpu_info.devices[0].compute_capability == (9, 0)
+        assert gpu_info.devices[0].memory_total_bytes == 24 * 1024**3
+        assert gpu_info.cuda_driver_version == 12060
+        assert gpu_info.driver_version == "560.10"
+
+
+def test_nvml_gpu_info_nvlink_states():
+    mock_handle = MagicMock()
+    mock_memory = MagicMock()
+    mock_memory.total = 16 * 1024**3
+
+    def nvlink_side_effect(handle, link_id):
+        if link_id < 2:
+            return 1
+        raise pynvml.NVMLError_NotSupported()
+
+    with (
+        patch("pynvml.nvmlInit"),
+        patch("pynvml.nvmlDeviceGetCount", return_value=1),
+        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
+        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"),
+        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)),
+        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
+        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=nvlink_side_effect),
+    ):
+        gpu_info = NvmlGpuInfo()
+        assert gpu_info.devices[0].nvlink_states == [True, True]
+
+
+def test_nvml_gpu_info_no_nvlink():
+    mock_handle = MagicMock()
+    mock_memory = MagicMock()
+    mock_memory.total = 16 * 1024**3
+
+    with (
+        patch("pynvml.nvmlInit"),
+        patch("pynvml.nvmlDeviceGetCount", return_value=1),
+        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
+        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"),
+        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)),
+        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
+        patch(
+            "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
+        ),
+    ):
+        gpu_info = NvmlGpuInfo()
+        assert gpu_info.devices[0].nvlink_states == []
+
+
+# --- DefaultSystemInfo tests ---
+
+
+def test_default_system_info_total_memory():
+    mock_vm = MagicMock()
+    mock_vm.total = 64 * 1024**3
+    with patch("psutil.virtual_memory", return_value=mock_vm):
+        sys_info = DefaultSystemInfo()
+        assert sys_info.total_memory_bytes == 64 * 1024**3
+
+
+def test_default_system_info_cuda_runtime_path():
+    with patch(
+        "cuda.pathfinder.find_nvidia_header_directory",
+        return_value="/usr/local/cuda/include",
+    ):
+        sys_info = DefaultSystemInfo()
+        assert sys_info.cuda_runtime_path == "/usr/local/cuda/include"
+
+
+def test_default_system_info_caches():
+    mock_vm = MagicMock()
+    mock_vm.total = 64 * 1024**3
+    with patch("psutil.virtual_memory", return_value=mock_vm) as mock_psutil:
+        sys_info = DefaultSystemInfo()
+        _ = sys_info.total_memory_bytes
+        _ = sys_info.total_memory_bytes
+        mock_psutil.assert_called_once()
+
+
+# --- FakeGpuInfo tests ---
+
+
+def test_fake_gpu_info_defaults():
+    fake = FakeGpuInfo()
+    assert fake.device_count == 0
+    assert fake.devices == []
+    assert fake.cuda_driver_version == 0
+    assert fake.driver_version == ""
+
+
+def test_fake_gpu_info_custom():
+    devices = [
+        DeviceInfo(index=0, compute_capability=(8, 0), memory_total_bytes=32 * 1024**3)
+    ]
+    fake = FakeGpuInfo(
+        device_count=1,
+        devices=devices,
+        cuda_driver_version=12040,
+        driver_version="550.0",
+    )
+    assert fake.device_count == 1
+    assert len(fake.devices) == 1
+    assert fake.cuda_driver_version == 12040
+
+
+def test_fake_gpu_info_satisfies_protocol():
+    assert isinstance(FakeGpuInfo(), GpuInfoProvider)
+
+
+# --- FakeSystemInfo tests ---
+
+
+def test_fake_system_info_defaults():
+    fake = FakeSystemInfo()
+    assert fake.total_memory_bytes == 0
+    assert fake.cuda_runtime_path is None
+
+
+def test_fake_system_info_satisfies_protocol():
+    assert isinstance(FakeSystemInfo(), SystemInfoProvider)
+
+
+# --- FailingGpuInfo tests ---
+
+
+def test_failing_gpu_info_device_count():
+    with pytest.raises(HardwareInfoError, match="No GPU available"):
+        _ = FailingGpuInfo().device_count
+
+
+def test_failing_gpu_info_devices():
+    with pytest.raises(HardwareInfoError, match="No GPU available"):
+        _ = FailingGpuInfo().devices
+
+
+def test_failing_gpu_info_cuda_driver_version():
+    with pytest.raises(HardwareInfoError, match="No GPU available"):
+        _ = FailingGpuInfo().cuda_driver_version
+
+
+def test_failing_gpu_info_driver_version():
+    with pytest.raises(HardwareInfoError, match="No GPU available"):
+        _ = FailingGpuInfo().driver_version
+
+
+# --- FailingSystemInfo tests ---
+
+
+def test_failing_system_info_total_memory():
+    with pytest.raises(HardwareInfoError, match="System info unavailable"):
+        _ = FailingSystemInfo().total_memory_bytes
+
+
+def test_failing_system_info_cuda_runtime_path():
+    with pytest.raises(HardwareInfoError, match="System info unavailable"):
+        _ = FailingSystemInfo().cuda_runtime_path
diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py
index 572df33..f1c73f8 100644
--- a/rapids_cli/tests/test_memory.py
+++ b/rapids_cli/tests/test_memory.py
@@ -1,7 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-from unittest.mock import MagicMock, patch
-
 import pytest

 from rapids_cli.doctor.checks.memory import (
@@ -9,74 +7,54 @@
     get_gpu_memory,
     get_system_memory,
 )
+from rapids_cli.hardware import DeviceInfo
+from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo, FakeSystemInfo


-def test_get_system_memory():
-    mock_vm = MagicMock()
-    mock_vm.total = 32 * 1024**3  # 32 GB in bytes
-    with patch("psutil.virtual_memory", return_value=mock_vm):
-        result = get_system_memory(verbose=False)
-        assert result == 32.0
+def test_get_system_memory(set_system_info):
+    set_system_info(FakeSystemInfo(total_memory_bytes=32 * 1024**3))
+    assert get_system_memory(verbose=False) == 32.0


-def test_get_gpu_memory_single_gpu():
-    mock_handle = MagicMock()
-    mock_memory_info = MagicMock()
-    mock_memory_info.total = 16 * 1024**3  # 16 GB in bytes
+def test_get_gpu_memory_single_gpu(set_gpu_info):
+    devices = [
+        DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3)
+    ]
+    set_gpu_info(FakeGpuInfo(device_count=1, devices=devices))
+    assert get_gpu_memory(verbose=False) == 16.0

-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=1),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info),
-        patch("pynvml.nvmlShutdown"),
-    ):
-        result = get_gpu_memory(verbose=False)
-        assert result == 16.0

+def test_get_gpu_memory_multiple_gpus(set_gpu_info):
+    devices = [
+        DeviceInfo(index=i, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3)
+        for i in range(4)
+    ]
+    set_gpu_info(FakeGpuInfo(device_count=4, devices=devices))
+    assert get_gpu_memory(verbose=False) == 64.0  # 16 GB * 4 GPUs

-def test_get_gpu_memory_multiple_gpus():
-    mock_handle = MagicMock()
-    mock_memory_info = MagicMock()
-    mock_memory_info.total = 16 * 1024**3  # 16 GB per GPU

-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=4),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info),
-        patch("pynvml.nvmlShutdown"),
-    ):
-        result = get_gpu_memory(verbose=False)
-        assert result == 64.0  # 16 GB * 4 GPUs
+def test_check_memory_to_gpu_ratio_good_ratio(set_gpu_info, set_system_info):
+    devices = [
+        DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3)
+    ]
+    set_gpu_info(FakeGpuInfo(device_count=1, devices=devices))
+    set_system_info(FakeSystemInfo(total_memory_bytes=64 * 1024**3))
+    assert check_memory_to_gpu_ratio(verbose=True) is True


-def test_check_memory_to_gpu_ratio_good_ratio():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=64.0),
-        patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0),
-    ):
-        result = check_memory_to_gpu_ratio(verbose=True)
-        assert result is True
+def test_check_memory_to_gpu_ratio_warning(set_gpu_info, set_system_info):
+    devices = [
+        DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3)
+    ]
+    set_gpu_info(FakeGpuInfo(device_count=1, devices=devices))
+    set_system_info(FakeSystemInfo(total_memory_bytes=32 * 1024**3))
+    with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"):
+        assert check_memory_to_gpu_ratio(verbose=True) is True


-def test_check_memory_to_gpu_ratio_warning():
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=32.0),
-        patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0),
+def test_check_memory_to_gpu_ratio_no_gpu(set_gpu_info):
+    set_gpu_info(FailingGpuInfo())
+    with pytest.raises(
+        ValueError, match="GPU not found. Please ensure GPUs are installed."
     ):
-        with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"):
-            result = check_memory_to_gpu_ratio(verbose=True)
-            assert result is True
-
-
-def test_check_memory_to_gpu_ratio_no_gpu():
-    import pynvml
-
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
-        with pytest.raises(
-            ValueError, match="GPU not found. Please ensure GPUs are installed."
-        ):
-            check_memory_to_gpu_ratio(verbose=False)
+        check_memory_to_gpu_ratio(verbose=False)
diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py
index 4deb0dc..5fd32c5 100644
--- a/rapids_cli/tests/test_nvlink.py
+++ b/rapids_cli/tests/test_nvlink.py
@@ -1,10 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-from unittest.mock import MagicMock, patch
-
 import pytest

 from rapids_cli.doctor.checks.nvlink import check_nvlink_status
+from rapids_cli.hardware import DeviceInfo
+from rapids_cli.tests.fakes import FailingGpuInfo, FakeGpuInfo
+
+
+def _make_device(index: int, nvlink_states: list[bool]) -> DeviceInfo:
+    return DeviceInfo(
+        index=index,
+        compute_capability=(7, 0),
+        memory_total_bytes=0,
+        nvlink_states=nvlink_states,
+    )


 @pytest.mark.parametrize(
@@ -14,132 +23,59 @@
     (False, None),
     ],
 )
-def test_check_nvlink_status_success(verbose, expected):
+def test_check_nvlink_status_success(set_gpu_info, verbose, expected):
     """2 GPUs, all NVLinks active — verbose controls whether a summary string is returned."""
-    import pynvml
+    # Simulate a V100 with 6 NVLink slots, all active.
+    devices = [_make_device(0, [True] * 6), _make_device(1, [True] * 6)]
+    set_gpu_info(FakeGpuInfo(device_count=2, devices=devices))
+    assert check_nvlink_status(verbose=verbose) == expected

-    mock_handle = MagicMock()
-
-    # Simulate a V100 with 6 NVLink slots; link_id >= 6 is out of range.
-    def mock_link_state(handle, link_id):
-        if link_id >= 6:
-            raise pynvml.NVMLError_InvalidArgument
-        return pynvml.NVML_FEATURE_ENABLED
-
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
-    ):
-        result = check_nvlink_status(verbose=verbose)
-        assert result == expected
-

-def test_check_nvlink_status_single_gpu():
+def test_check_nvlink_status_single_gpu(set_gpu_info):
     """Single GPU — NVLink is not applicable, check skips early."""
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=1),
-    ):
-        result = check_nvlink_status(verbose=False)
-        assert result is False
-
+    set_gpu_info(FakeGpuInfo(device_count=1, devices=[_make_device(0, [])]))
+    assert check_nvlink_status(verbose=False) is False

-def test_check_nvlink_status_no_gpu():
-    """nvmlInit fails — no GPUs installed."""
-    import pynvml

-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
-        with pytest.raises(
-            ValueError, match="GPU not found. Please ensure GPUs are installed."
-        ):
-            check_nvlink_status(verbose=False)
-
-
-def test_check_nvlink_status_not_supported():
-    """NVLink is not supported on this system — check skips silently like single-GPU case."""
-    import pynvml
-
-    mock_handle = MagicMock()
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch(
-            "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
-        ),
+def test_check_nvlink_status_no_gpu(set_gpu_info):
+    """GPU info unavailable — surfaces as a GPU-not-found error."""
+    set_gpu_info(FailingGpuInfo())
+    with pytest.raises(
+        ValueError, match="GPU not found. Please ensure GPUs are installed."
     ):
-        result = check_nvlink_status(verbose=False)
-        assert result is False
+        check_nvlink_status(verbose=False)


-def test_check_nvlink_status_link_inactive():
-    """A supported link is inactive — check fails and reports which GPU and link."""
-    import pynvml
-
-    mock_handle = MagicMock()
+def test_check_nvlink_status_not_supported(set_gpu_info):
+    """NVLink not supported on any device — check skips silently like single-GPU case."""
+    # When NVML reports NVLink as not supported, NvmlGpuInfo records an empty list.
+    devices = [_make_device(0, []), _make_device(1, [])]
+    set_gpu_info(FakeGpuInfo(device_count=2, devices=devices))
+    assert check_nvlink_status(verbose=False) is False

-    # Simulate a V100 with 6 NVLink slots, all inactive.
-    def mock_link_state(handle, link_id):
-        if link_id >= 6:
-            raise pynvml.NVMLError_InvalidArgument
-        return pynvml.NVML_FEATURE_DISABLED

-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
-    ):
-        with pytest.raises(ValueError, match="NVLink inactive on:"):
-            check_nvlink_status(verbose=False)
+def test_check_nvlink_status_link_inactive(set_gpu_info):
+    """A supported link is inactive — check fails and reports which GPU and link."""
+    devices = [_make_device(0, [False] * 6), _make_device(1, [False] * 6)]
+    set_gpu_info(FakeGpuInfo(device_count=2, devices=devices))
+    with pytest.raises(ValueError, match="NVLink inactive on:"):
+        check_nvlink_status(verbose=False)


-def test_check_nvlink_status_partial_failure():
+def test_check_nvlink_status_partial_failure(set_gpu_info):
     """Some links active, some inactive — all failures are reported in a single error."""
-    import pynvml
-
-    mock_handle = MagicMock()
-
-    # Simulate a V100 with 6 NVLink slots: link 0 active, link 1 inactive, rest active.
-    def mock_link_state(handle, link_id):
-        if link_id >= 6:
-            raise pynvml.NVMLError_InvalidArgument
-        if link_id == 1:
-            return pynvml.NVML_FEATURE_DISABLED
-        return pynvml.NVML_FEATURE_ENABLED
-
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
-    ):
-        with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info:
-            check_nvlink_status(verbose=False)
-        # Both GPUs should have link 1 reported as failed
-        assert "GPU 0 link 1" in str(exc_info.value)
-        assert "GPU 1 link 1" in str(exc_info.value)
-
-
-def test_check_nvlink_status_invalid_argument():
-    """NVMLError_InvalidArgument stops link iteration early — check succeeds for valid links."""
-    import pynvml
-
-    mock_handle = MagicMock()
-
-    # Simulate an A100 with 12 NVLink slots; link_id >= 12 is out of range.
-    def mock_link_state(handle, link_id):
-        if link_id >= 12:
-            raise pynvml.NVMLError_InvalidArgument
-        return pynvml.NVML_FEATURE_ENABLED
-
-    with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
-    ):
-        result = check_nvlink_status(verbose=True)
-        assert result == "All NVLinks active across 2 GPUs"
+    # V100 with 6 NVLink slots: link 0 active, link 1 inactive, rest active.
+    states = [True, False, True, True, True, True]
+    devices = [_make_device(0, states), _make_device(1, states)]
+    set_gpu_info(FakeGpuInfo(device_count=2, devices=devices))
+    with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info:
+        check_nvlink_status(verbose=False)
+    assert "GPU 0 link 1" in str(exc_info.value)
+    assert "GPU 1 link 1" in str(exc_info.value)
+
+
+def test_check_nvlink_status_mixed_link_counts(set_gpu_info):
+    """Links of differing counts (e.g. A100=12) iterate fully and succeed when all active."""
+    devices = [_make_device(0, [True] * 12), _make_device(1, [True] * 12)]
+    set_gpu_info(FakeGpuInfo(device_count=2, devices=devices))
+    assert check_nvlink_status(verbose=True) == "All NVLinks active across 2 GPUs"
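Since `doctor_check` discovers checks through entry points and every check now takes `(verbose=False, **kwargs)`, an external check can consume the same registry without ever importing pynvml. A hypothetical out-of-tree check, for illustration only — the name, threshold, and entry-point registration are all invented:

```python
from rapids_cli.hardware import HardwareInfoError
from rapids_cli.providers import get_gpu_info

def min_gpu_memory_check(verbose=False, **kwargs):
    """Hypothetical check: every GPU should have at least 8 GiB of memory."""
    try:
        devices = get_gpu_info().devices
    except HardwareInfoError as e:
        raise ValueError("No available GPUs detected") from e
    small = [dev.index for dev in devices if dev.memory_total_bytes < 8 * 1024**3]
    if small:
        raise ValueError(f"GPU(s) {small} have less than 8 GiB of memory")
    return True
```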