diff --git a/CLAUDE.md b/CLAUDE.md
index d73a716..5b794f0 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -111,7 +111,7 @@ The doctor command discovers and runs checks via Python entry points defined in
 ### Key Dependencies
 
 - `rich` and `rich-click` for terminal output and CLI interface
-- `pynvml` (nvidia-ml-py) for GPU information
+- `cuda-core` for GPU information
 - `cuda-pathfinder` for locating CUDA installations
 - `psutil` for system memory checks
 
diff --git a/conda/recipes/rapids-cli/recipe.yaml b/conda/recipes/rapids-cli/recipe.yaml
index 4e3ad4a..63ab677 100644
--- a/conda/recipes/rapids-cli/recipe.yaml
+++ b/conda/recipes/rapids-cli/recipe.yaml
@@ -32,12 +32,14 @@ requirements:
     - python
     - importlib-metadata >=4.13.0
     - cuda-pathfinder >=1.2.3
-    - nvidia-ml-py >=12.0
     - packaging
     - psutil
     - pyyaml
     - rich
     - rich-click
+    - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
+    # TODO: Change to cuda-core >= 1.0.0 once that's released
+    - cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
 
 tests:
   - script:
diff --git a/dependencies.yaml b/dependencies.yaml
index d312739..0318431 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -61,8 +61,9 @@ dependencies:
   common:
     - output_types: [conda, requirements, pyproject]
       packages:
-        - cuda-core >=0.6.0
-        - nvidia-ml-py>=12.0
+        - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
+        # TODO: Change to cuda-core >= 1.0.0 once that's released
+        - cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
         - cuda-pathfinder >=1.2.3
         - packaging
         - psutil
diff --git a/docs/source/api/debug.rst b/docs/source/api/debug.rst
index aa4e84b..d892970 100644
--- a/docs/source/api/debug.rst
+++ b/docs/source/api/debug.rst
@@ -10,7 +10,7 @@ for troubleshooting RAPIDS installations.
 :func:`~rapids_cli.debug.debug.run_debug` is the main entry point. It collects:
 
 - Platform and OS details (from ``platform`` and ``/etc/os-release``)
-- NVIDIA driver and CUDA versions (via ``pynvml``)
+- NVIDIA driver and CUDA versions (via ``cuda.core.system``)
 - CUDA runtime path (via ``cuda-pathfinder``)
 - System CUDA toolkit locations (globbing ``/usr/local/cuda*``)
 - Python version and hash info
diff --git a/docs/source/plugin_development.rst b/docs/source/plugin_development.rst
index d5b5e45..6ecd9af 100644
--- a/docs/source/plugin_development.rst
+++ b/docs/source/plugin_development.rst
@@ -95,15 +95,13 @@ GPU memory requirement check:
 
 .. code-block:: python
 
-    import pynvml
+    from cuda.core import system
 
     def gpu_memory_check(verbose=False, **kwargs):
         """Check that GPU has at least 8GB memory."""
-        pynvml.nvmlInit()
-        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
-        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        available_gb = mem.total / (1024**3)
+        device = system.Device(index=0)
+        available_gb = device.memory_info.total / (1024**3)
 
         if available_gb < 8:
             raise ValueError(
diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 5da7f2c..9c8a23b 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -19,7 +19,7 @@ No GPUs Detected
 
    .. code-block:: bash
 
-      python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())"
+      python -c "from cuda.core import system; print(system.Device.get_device_count())"
 
 3. If running in a container, ensure GPU passthrough is enabled:
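For review convenience, the mapping below collects every pynvml call this changeset replaces and the cuda.core.system name it assumes instead. cuda-core 1.0 is not yet released, so the right-hand side reflects this diff (and its TODOs) rather than published API documentation:

# pynvml -> cuda.core.system mapping assumed throughout this changeset
# (cuda-core >= 1.0 is unreleased; names are taken from the diff itself):
#
#   pynvml.nvmlInit()                        -> no explicit init (assumed implicit)
#   pynvml.nvmlDeviceGetCount()              -> system.Device.get_device_count()
#   pynvml.nvmlDeviceGetHandleByIndex(i)     -> system.Device(index=i)
#   pynvml.nvmlDeviceGetMemoryInfo(h).total  -> device.memory_info.total
#   pynvml.nvmlDeviceGetCudaComputeCapability(h)
#                                            -> device.cuda_compute_capability
#   pynvml.nvmlSystemGetCudaDriverVersion()  -> system.get_driver_version_full()
#   pynvml.nvmlDeviceGetNvLinkState(h, l)    -> device.get_nvlink(l).state
#   pynvml.NVMLError                         -> system.NvmlError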
diff --git a/pyproject.toml b/pyproject.toml
index 882cc68..29e58b2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,10 +7,10 @@ license-files = ["LICENSE"]
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "cuda-core >=0.6.0",
+    "cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*",
+    "cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core",
     "cuda-pathfinder >=1.2.3",
     "importlib-metadata >= 4.13.0; python_version < '3.12'",
-    "nvidia-ml-py>=12.0",
     "packaging",
     "psutil",
     "pyyaml",
@@ -49,6 +49,10 @@ version-file = "rapids_cli/_version.py"
 [tool.hatch.version]
 source = "vcs"
 
+[tool.hatch.metadata]
+# TODO: Remove me when cuda-core 1.0 is released
+allow-direct-references = true
+
 [tool.black]
 # this should match the oldest version of Python the library supports
 target-version = ["py310"]
diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py
index fca4d1d..f3160d1 100644
--- a/rapids_cli/debug/debug.py
+++ b/rapids_cli/debug/debug.py
@@ -11,7 +11,7 @@
 from pathlib import Path
 
 import cuda.pathfinder
-import pynvml
+from cuda.core import system
 from rich.console import Console
 from rich.table import Table
 
@@ -20,11 +20,7 @@
 
 def gather_cuda_version():
     """Return CUDA driver version as a string, similar to nvidia-smi output."""
-    version = pynvml.nvmlSystemGetCudaDriverVersion()
-    # pynvml returns an int like 12040 for 12.4, so format as string
-    major = version // 1000
-    minor = (version % 1000) // 10
-    patch = version % 10
+    major, minor, patch = system.get_driver_version_full()
     if patch == 0:
         return f"{major}.{minor}"
     else:
@@ -69,14 +65,15 @@ def gather_tools():
 
 def run_debug(output_format="console"):
     """Run debug."""
-    pynvml.nvmlInit()
     debug_info = {
         "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "platform": platform.platform(),
         "nvidia_smi_output": gather_command_output(
             ["nvidia-smi"], "Nvidia-smi not installed"
         ),
-        "driver_version": pynvml.nvmlSystemGetDriverVersion(),
+        "driver_version": ".".join(
+            str(x) for x in system.get_driver_version_full(kernel_mode=True)
+        ),
         "cuda_version": gather_cuda_version(),
         "cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"),
         "system_ctk": sorted(
diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py
index 252dd47..1aa3a41 100644
--- a/rapids_cli/doctor/checks/cuda_driver.py
+++ b/rapids_cli/doctor/checks/cuda_driver.py
@@ -2,17 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0
 """Check for CUDA and driver compatibility."""
 
-import pynvml
+from cuda.core import system
 
 
 def cuda_check(verbose=False):
     """Check CUDA availability."""
     try:
-        pynvml.nvmlInit()
-        try:
-            cuda_version = pynvml.nvmlSystemGetCudaDriverVersion()
-            return cuda_version
-        except pynvml.NVMLError as e:
-            raise ValueError("Unable to look up CUDA version") from e
-    except pynvml.NVMLError as e:
+        cuda_version = system.get_driver_version_full()
+        return cuda_version[0] * 1000 + cuda_version[1] * 10 + cuda_version[2]
+    except system.NvmlError as e:
         raise ValueError("Unable to look up CUDA version") from e
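The version arithmetic above is the easiest thing to get wrong in review: cuda_check() re-encodes the (major, minor, patch) tuple into the pynvml-style integer its callers already expect, and gather_cuda_version() formats the tuple the way nvidia-smi prints it. A dependency-free sketch of both conversions (the two helper names are invented for illustration):

def encode_cuda_version(major: int, minor: int, patch: int) -> int:
    # Mirrors cuda_check(): pynvml encoded CUDA 12.4 as 12040, so keep that shape.
    return major * 1000 + minor * 10 + patch


def format_cuda_version(major: int, minor: int, patch: int) -> str:
    # Mirrors gather_cuda_version(): drop a zero patch, nvidia-smi style.
    if patch == 0:
        return f"{major}.{minor}"
    return f"{major}.{minor}.{patch}"


assert encode_cuda_version(12, 5, 0) == 12050  # matches test_cuda_check_success
assert format_cuda_version(12, 4, 0) == "12.4"  # matches test_gather_cuda_version
assert format_cuda_version(12, 34, 5) == "12.34.5"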
diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py
index 77e6ca6..cf77e9a 100644
--- a/rapids_cli/doctor/checks/gpu.py
+++ b/rapids_cli/doctor/checks/gpu.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """GPU checks for the doctor command."""
 
-import pynvml
+from cuda.core import system
 
 REQUIRED_COMPUTE_CAPABILITY = 7
 
@@ -10,9 +10,8 @@
 def gpu_check(verbose=False):
     """Check GPU availability."""
     try:
-        pynvml.nvmlInit()
-        num_gpus = pynvml.nvmlDeviceGetCount()
-    except pynvml.NVMLError as e:
+        num_gpus = system.Device.get_device_count()
+    except system.NvmlError as e:
         raise ValueError("No available GPUs detected") from e
     assert num_gpus > 0, "No GPUs detected"
     return f"GPU(s) detected: {num_gpus}"
@@ -21,13 +20,14 @@ def gpu_check(verbose=False):
 def check_gpu_compute_capability(verbose):
     """Check the system for GPU Compute Capability."""
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        num_gpus = system.Device.get_device_count()
+    except system.NvmlError as e:
         raise ValueError("No GPU - cannot determine GPU Compute Capability") from e
+    if num_gpus == 0:
+        raise ValueError("No GPU - cannot determine GPU Compute Capability")
 
-    for i in range(pynvml.nvmlDeviceGetCount()):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+    for i, device in enumerate(system.Device.get_all_devices()):
+        major, minor = device.cuda_compute_capability
         if major >= REQUIRED_COMPUTE_CAPABILITY:
             continue
         else:
diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py
index cb1fcb5..138d764 100644
--- a/rapids_cli/doctor/checks/memory.py
+++ b/rapids_cli/doctor/checks/memory.py
@@ -5,7 +5,8 @@
 import warnings
 
 import psutil
-import pynvml
+
+from cuda.core import system
 
 
 def get_system_memory(verbose=False):
@@ -17,15 +18,11 @@ def get_system_memory(verbose=False):
 
 def get_gpu_memory(verbose=False):
     """Get the total GPU memory."""
-    pynvml.nvmlInit()
-    gpus = pynvml.nvmlDeviceGetCount()
+
     gpu_memory_total = 0
-    for i in range(gpus):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        gpu_memory_total += memory_info.total / (1024**3)  # converts to gigabytes
+    for device in system.Device.get_all_devices():
+        gpu_memory_total += device.memory_info.total / (1024**3)  # converts to gigabytes
 
-    pynvml.nvmlShutdown()
     return gpu_memory_total
 
@@ -36,9 +33,11 @@
     """
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
-        raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
+        num_gpus = system.Device.get_device_count()
+    except system.NvmlError as e:
+        raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
+    if num_gpus == 0:
+        raise ValueError("GPU not found. Please ensure GPUs are installed.")
 
     system_memory = get_system_memory(verbose)
     gpu_memory = get_gpu_memory(verbose)
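get_gpu_memory() now sums device.memory_info.total over all devices. NVML reports totals in bytes, so the division by 1024**3 converts to GiB. A dependency-free sketch of the same arithmetic, with invented byte counts:

# Four GPUs reporting 16 GiB each, as raw byte counts (values invented).
totals_bytes = [16 * 1024**3] * 4
gpu_memory_total = sum(total / (1024**3) for total in totals_bytes)
assert gpu_memory_total == 64.0  # matches test_get_gpu_memory_multiple_gpus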
diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py
index 6dd6c66..82debf6 100644
--- a/rapids_cli/doctor/checks/nvlink.py
+++ b/rapids_cli/doctor/checks/nvlink.py
@@ -2,18 +2,19 @@
 # SPDX-License-Identifier: Apache-2.0
 """Check for NVLink status."""
 
-import pynvml
+from cuda.core import system
+from cuda.bindings import nvml
 
 
 def check_nvlink_status(verbose=True, **kwargs):
     """Check NVLink status across all GPUs."""
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        device_count = system.Device.get_device_count()
+    except system.NvmlError as e:
         raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
+    if device_count == 0:
+        raise ValueError("GPU not found. Please ensure GPUs are installed.")
 
-    device_count = pynvml.nvmlDeviceGetCount()
-
     # NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing
     # to link to, so there is nothing to check.
     if device_count < 2:
@@ -25,24 +26,22 @@ def check_nvlink_status(verbose=True, **kwargs):
 
     failed_links: list[tuple[int, int]] = []
 
-    for gpu_idx in range(device_count):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+    for gpu_idx, device in enumerate(system.Device.get_all_devices()):
         # NVML provides no API to query the number of NVLink slots on a device
         # (e.g. V100=6, A100=12, H100=18). The only way to discover the real count
-        # is to iterate up to NVML_NVLINK_MAX_LINKS and stop when the driver signals
-        # that link_id is out of range via NVMLError_InvalidArgument.
-        for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
+        # is to iterate up to NVLINK_MAX_LINKS and stop when the driver signals
+        # that link_id is out of range via InvalidArgumentError.
+        for link_id in range(nvml.NVLINK_MAX_LINKS):
             try:
-                # nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED
-                # if the link is active, or NVML_FEATURE_DISABLED if it is not.
-                state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id)
-                if state == pynvml.NVML_FEATURE_DISABLED:
+                # get_nvlink(link_id).state is truthy when the link is active
+                # (NVML_FEATURE_ENABLED) and falsy when it is disabled.
+                if not device.get_nvlink(link_id).state:
                     failed_links.append((gpu_idx, link_id))
-            except pynvml.NVMLError_NotSupported:
+            except system.NotSupportedError:
                 # The driver reports NVLink is not supported on this system.
                 # There is nothing to check — skip like the single-GPU case above.
                 return False
-            except pynvml.NVMLError_InvalidArgument:
+            except system.InvalidArgumentError:
                 # link_id exceeds the number of NVLink slots on this device.
                 # Stop iterating links for this GPU.
                 break
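The loop above is the subtle part of this file: NVML offers no query for how many NVLink slots a device has, so the code probes link ids in order until the driver rejects one. The same pattern as a standalone, runnable sketch, with probe() standing in for device.get_nvlink(link_id).state and IndexError standing in for InvalidArgumentError:

def count_links(probe, max_links):
    # Probe link ids in order; the first out-of-range id ends the scan.
    found = 0
    for link_id in range(max_links):
        try:
            probe(link_id)
        except IndexError:  # stand-in for the driver's InvalidArgumentError
            break
        found += 1
    return found


links = [True] * 6  # a V100-like device exposes 6 NVLink slots
assert count_links(lambda link_id: links[link_id], 18) == 6  # 18 = NVLINK_MAX_LINKS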
diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py
index c6d4525..c026575 100644
--- a/rapids_cli/tests/test_cuda.py
+++ b/rapids_cli/tests/test_cuda.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from unittest.mock import patch
 
-import pynvml
 import pytest
 
 from rapids_cli.doctor.checks.cuda_driver import cuda_check
@@ -10,24 +9,27 @@
 def test_cuda_check_success():
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
+        patch("cuda.core.system.get_driver_version_full", return_value=(12, 5, 0)),
     ):
         assert cuda_check(verbose=True) == 12050
 
 
 def test_cuda_check_init_fails():
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
+    from cuda.bindings import nvml
+
+    with patch("cuda.bindings.nvml.init_v2", side_effect=nvml.NvmlError(1)):
         with pytest.raises(ValueError, match="Unable to look up CUDA version"):
             cuda_check()
 
 
 def test_cuda_check_version_query_fails():
+    from cuda.bindings import nvml
+
     with (
-        patch("pynvml.nvmlInit"),
+        patch("cuda.bindings.nvml.init_v2"),
         patch(
-            "pynvml.nvmlSystemGetCudaDriverVersion",
-            side_effect=pynvml.NVMLError(1),
+            "cuda.bindings.nvml.system_get_cuda_driver_version",
+            side_effect=nvml.NvmlError(1),
         ),
     ):
         with pytest.raises(ValueError, match="Unable to look up CUDA version"):
diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py
index 91c330c..d94c1f6 100644
--- a/rapids_cli/tests/test_debug.py
+++ b/rapids_cli/tests/test_debug.py
@@ -14,14 +14,14 @@
 
 def test_gather_cuda_version():
     """Test CUDA version gathering."""
-    with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040):
+    with patch("cuda.core.system.get_driver_version_full", return_value=(12, 4, 0)):
         result = gather_cuda_version()
         assert result == "12.4"
 
 
 def test_gather_cuda_version_with_patch():
     """Test CUDA version with patch number."""
-    with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12345):
+    with patch("cuda.core.system.get_driver_version_full", return_value=(12, 34, 5)):
         result = gather_cuda_version()
         assert result == "12.34.5"
@@ -74,9 +74,9 @@ def test_run_debug_console(capsys):
     mock_vm.total = 32 * 1024**3
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"),
-        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.system_get_driver_version", return_value="550.54.15"),
+        patch("cuda.bindings.nvml.system_get_cuda_driver_version", return_value=12040),
         patch(
             "cuda.pathfinder.find_nvidia_header_directory",
             return_value="/usr/local/cuda/include",
@@ -95,10 +95,10 @@ def test_run_debug_console(capsys):
 
 def test_run_debug_json(capsys):
     """Test run_debug with JSON output."""
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"),
-        patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.system_get_driver_version", return_value="550.54.15"),
+        patch("cuda.bindings.nvml.system_get_cuda_driver_version", return_value=12040),
         patch(
             "cuda.pathfinder.find_nvidia_header_directory",
             return_value="/usr/local/cuda/include",
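Note the pattern change in the tests: instead of patching pynvml functions, they now patch the low-level cuda.bindings.nvml functions that cuda.core.system is assumed to resolve at call time (only get_driver_version_full is patched at the cuda.core layer directly). Patching works because the attribute is looked up when the call happens, which the following dependency-free example demonstrates:

import math
from unittest.mock import patch


def circle_area(radius):
    return math.pi * radius * radius  # math.pi is resolved at call time


with patch("math.pi", 3.0):
    assert circle_area(2.0) == 12.0  # the patched attribute is visible here
assert circle_area(2.0) != 12.0  # and automatically restored afterwards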
diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py
index a895bc2..3b3665c 100644
--- a/rapids_cli/tests/test_gpu.py
+++ b/rapids_cli/tests/test_gpu.py
@@ -13,8 +13,8 @@
 
 def test_gpu_check_success():
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
     ):
         result = gpu_check(verbose=True)
         assert result == "GPU(s) detected: 2"
@@ -22,28 +22,28 @@ def test_gpu_check_success():
 
 def test_gpu_check_no_gpus():
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=0),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=0),
     ):
         with pytest.raises(AssertionError, match="No GPUs detected"):
             gpu_check(verbose=False)
 
 
 def test_gpu_check_nvml_error():
-    import pynvml
+    from cuda.bindings import nvml
 
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
+    with patch("cuda.bindings.nvml.init_v2", side_effect=nvml.NvmlError(1)):
        with pytest.raises(ValueError, match="No available GPUs detected"):
             gpu_check(verbose=False)
 
 
 def test_check_gpu_compute_capability_success():
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex"),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
         patch(
-            "pynvml.nvmlDeviceGetCudaComputeCapability",
+            "cuda.bindings.nvml.device_get_cuda_compute_capability",
             return_value=(REQUIRED_COMPUTE_CAPABILITY, 5),
         ),
     ):
@@ -53,10 +53,10 @@ def test_check_gpu_compute_capability_success():
 
 def test_check_gpu_compute_capability_insufficient():
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=1),
-        patch("pynvml.nvmlDeviceGetHandleByIndex"),
-        patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(6, 0)),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=1),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_cuda_compute_capability", return_value=(6, 0)),
     ):
         with pytest.raises(
             ValueError,
@@ -66,9 +66,9 @@ def test_check_gpu_compute_capability_insufficient():
 
 def test_check_gpu_compute_capability_no_gpu():
-    import pynvml
+    from cuda.bindings import nvml
 
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
+    with patch("cuda.bindings.nvml.init_v2", side_effect=nvml.NvmlError(1)):
         with pytest.raises(
             ValueError, match="No GPU - cannot determine GPU Compute Capability"
         ):
diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py
index 572df33..4ef5e32 100644
--- a/rapids_cli/tests/test_memory.py
+++ b/rapids_cli/tests/test_memory.py
@@ -20,32 +20,30 @@ def test_get_system_memory():
 
 def test_get_gpu_memory_single_gpu():
-    mock_handle = MagicMock()
     mock_memory_info = MagicMock()
     mock_memory_info.total = 16 * 1024**3  # 16 GB in bytes
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=1),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info),
-        patch("pynvml.nvmlShutdown"),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=1),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_memory_info_v2", return_value=mock_memory_info),
+        patch("cuda.bindings.nvml.shutdown"),
     ):
         result = get_gpu_memory(verbose=False)
         assert result == 16.0
 
 
 def test_get_gpu_memory_multiple_gpus():
-    mock_handle = MagicMock()
     mock_memory_info = MagicMock()
     mock_memory_info.total = 16 * 1024**3  # 16 GB per GPU
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=4),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info),
-        patch("pynvml.nvmlShutdown"),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=4),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_memory_info_v2", return_value=mock_memory_info),
+        patch("cuda.bindings.nvml.shutdown"),
     ):
         result = get_gpu_memory(verbose=False)
         assert result == 64.0  # 16 GB * 4 GPUs
@@ -53,7 +51,8 @@ def test_get_gpu_memory_multiple_gpus():
 
 def test_check_memory_to_gpu_ratio_good_ratio():
     with (
-        patch("pynvml.nvmlInit"),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
         patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=64.0),
         patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0),
     ):
@@ -63,7 +62,8 @@ def test_check_memory_to_gpu_ratio_good_ratio():
 
 def test_check_memory_to_gpu_ratio_warning():
     with (
-        patch("pynvml.nvmlInit"),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
         patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=32.0),
         patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0),
     ):
@@ -73,9 +73,10 @@ def test_check_memory_to_gpu_ratio_warning():
 
 def test_check_memory_to_gpu_ratio_no_gpu():
-    import pynvml
-
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
+    with (
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=0),
+    ):
         with pytest.raises(
             ValueError, match="GPU not found. Please ensure GPUs are installed."
         ):
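The memory tests keep building their fake NVML memory structs with MagicMock: any attribute can be assigned on a MagicMock instance, so .total stands in for the real memory-info field no matter which library returns the struct. Minimal illustration:

from unittest.mock import MagicMock

mock_memory_info = MagicMock()
mock_memory_info.total = 16 * 1024**3  # 16 GiB in bytes, as NVML reports it
assert mock_memory_info.total / (1024**3) == 16.0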
diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py
index 4deb0dc..142d2e9 100644
--- a/rapids_cli/tests/test_nvlink.py
+++ b/rapids_cli/tests/test_nvlink.py
@@ -16,21 +16,19 @@
 )
 def test_check_nvlink_status_success(verbose, expected):
     """2 GPUs, all NVLinks active — verbose controls whether a summary string is returned."""
-    import pynvml
-
-    mock_handle = MagicMock()
+    from cuda.bindings import nvml
 
     # Simulate a V100 with 6 NVLink slots; link_id >= 6 is out of range.
     def mock_link_state(handle, link_id):
         if link_id >= 6:
-            raise pynvml.NVMLError_InvalidArgument
-        return pynvml.NVML_FEATURE_ENABLED
+            raise nvml.InvalidArgumentError(2)
+        return nvml.EnableState.FEATURE_ENABLED
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_nvlink_state", side_effect=mock_link_state),
     ):
         result = check_nvlink_status(verbose=verbose)
         assert result == expected
@@ -39,8 +37,8 @@ def mock_link_state(handle, link_id):
 
 def test_check_nvlink_status_single_gpu():
     """Single GPU — NVLink is not applicable, check skips early."""
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=1),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=1),
     ):
         result = check_nvlink_status(verbose=False)
         assert result is False
@@ -48,9 +46,9 @@ def test_check_nvlink_status_single_gpu():
 
 def test_check_nvlink_status_no_gpu():
-    """nvmlInit fails — no GPUs installed."""
-    import pynvml
+    """NVML init fails — no GPUs installed."""
+    from cuda.bindings import nvml
 
-    with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
+    with patch("cuda.bindings.nvml.init_v2", side_effect=nvml.NvmlError(1)):
         with pytest.raises(
             ValueError, match="GPU not found. Please ensure GPUs are installed."
         ):
@@ -59,15 +57,14 @@ def test_check_nvlink_status_no_gpu():
 
 def test_check_nvlink_status_not_supported():
     """NVLink is not supported on this system — check skips silently like single-GPU case."""
-    import pynvml
+    from cuda.bindings import nvml
 
-    mock_handle = MagicMock()
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
         patch(
-            "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
+            "cuda.bindings.nvml.device_get_nvlink_state", side_effect=nvml.NotSupportedError(3)
         ),
     ):
         result = check_nvlink_status(verbose=False)
@@ -76,21 +73,19 @@ def mock_link_state(handle, link_id):
 
 def test_check_nvlink_status_link_inactive():
     """A supported link is inactive — check fails and reports which GPU and link."""
-    import pynvml
-
-    mock_handle = MagicMock()
+    from cuda.bindings import nvml
 
     # Simulate a V100 with 6 NVLink slots, all inactive.
    def mock_link_state(handle, link_id):
         if link_id >= 6:
-            raise pynvml.NVMLError_InvalidArgument
-        return pynvml.NVML_FEATURE_DISABLED
+            raise nvml.InvalidArgumentError(2)
+        return nvml.EnableState.FEATURE_DISABLED
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_nvlink_state", side_effect=mock_link_state),
     ):
         with pytest.raises(ValueError, match="NVLink inactive on:"):
             check_nvlink_status(verbose=False)
@@ -98,23 +93,21 @@ def mock_link_state(handle, link_id):
 
 def test_check_nvlink_status_partial_failure():
     """Some links active, some inactive — all failures are reported in a single error."""
-    import pynvml
-
-    mock_handle = MagicMock()
+    from cuda.bindings import nvml
 
     # Simulate a V100 with 6 NVLink slots: link 0 active, link 1 inactive, rest active.
     def mock_link_state(handle, link_id):
         if link_id >= 6:
-            raise pynvml.NVMLError_InvalidArgument
+            raise nvml.InvalidArgumentError(2)
         if link_id == 1:
-            return pynvml.NVML_FEATURE_DISABLED
-        return pynvml.NVML_FEATURE_ENABLED
+            return nvml.EnableState.FEATURE_DISABLED
+        return nvml.EnableState.FEATURE_ENABLED
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_nvlink_state", side_effect=mock_link_state),
     ):
         with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info:
             check_nvlink_status(verbose=False)
@@ -125,21 +118,19 @@ def mock_link_state(handle, link_id):
 
 def test_check_nvlink_status_invalid_argument():
-    """NVMLError_InvalidArgument stops link iteration early — check succeeds for valid links."""
-    import pynvml
-
-    mock_handle = MagicMock()
+    """InvalidArgumentError stops link iteration early — check succeeds for valid links."""
+    from cuda.bindings import nvml
 
     # Simulate an A100 with 12 NVLink slots; link_id >= 12 is out of range.
     def mock_link_state(handle, link_id):
         if link_id >= 12:
-            raise pynvml.NVMLError_InvalidArgument
-        return pynvml.NVML_FEATURE_ENABLED
+            raise nvml.InvalidArgumentError(2)
+        return nvml.EnableState.FEATURE_ENABLED
 
     with (
-        patch("pynvml.nvmlInit"),
-        patch("pynvml.nvmlDeviceGetCount", return_value=2),
-        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
+        patch("cuda.bindings.nvml.init_v2"),
+        patch("cuda.bindings.nvml.device_get_count_v2", return_value=2),
+        patch("cuda.bindings.nvml.device_get_handle_by_index_v2", return_value=0xffffffff),
+        patch("cuda.bindings.nvml.device_get_nvlink_state", side_effect=mock_link_state),
     ):
         result = check_nvlink_status(verbose=True)
         assert result == "All NVLinks active across 2 GPUs"