Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ The doctor command discovers and runs checks via Python entry points defined in
### Key Dependencies

- `rich` and `rich-click` for terminal output and CLI interface
- `pynvml` (nvidia-ml-py) for GPU information
- `cuda-core` for GPU information
- `cuda-pathfinder` for locating CUDA installations
- `psutil` for system memory checks

Expand Down
4 changes: 3 additions & 1 deletion conda/recipes/rapids-cli/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ requirements:
- python
- importlib-metadata >=4.13.0
- cuda-pathfinder >=1.2.3
- nvidia-ml-py >=12.0
- packaging
- psutil
- pyyaml
- rich
- rich-click
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
# TODO: Change to cuda-core >= 1.0.0 once that's released
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core

tests:
- script:
Expand Down
5 changes: 3 additions & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- cuda-core >=0.6.0
- nvidia-ml-py>=12.0
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
# TODO: Change to cuda-core >= 1.0.0 once that's released
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
- cuda-pathfinder >=1.2.3
- packaging
- psutil
Expand Down
2 changes: 1 addition & 1 deletion docs/source/api/debug.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ for troubleshooting RAPIDS installations.
:func:`~rapids_cli.debug.debug.run_debug` is the main entry point. It collects:

- Platform and OS details (from ``platform`` and ``/etc/os-release``)
- NVIDIA driver and CUDA versions (via ``pynvml``)
- NVIDIA driver and CUDA versions (via ``cuda.core.system``)
- CUDA runtime path (via ``cuda-pathfinder``)
- System CUDA toolkit locations (globbing ``/usr/local/cuda*``)
- Python version and hash info
Expand Down
8 changes: 3 additions & 5 deletions docs/source/plugin_development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,13 @@ GPU memory requirement check:

.. code-block:: python

import pynvml
from cuda.core import system


def gpu_memory_check(verbose=False, **kwargs):
"""Check that GPU has at least 8GB memory."""
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
available_gb = mem.total / (1024**3)
device = system.Device(index=0)
available_gb = device.memory_info.total / (1024**3)

if available_gb < 8:
raise ValueError(
Expand Down
2 changes: 1 addition & 1 deletion docs/source/troubleshooting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ No GPUs Detected

.. code-block:: bash

python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())"
python -c "from cuda.core import system; print(system.Device.get_device_count())"

3. If running in a container, ensure GPU passthrough is enabled:

Expand Down
8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ license-files = ["LICENSE"]
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"cuda-core >=0.6.0",
"cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*",
"cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core",
"cuda-pathfinder >=1.2.3",
"importlib-metadata >= 4.13.0; python_version < '3.12'",
"nvidia-ml-py>=12.0",
"packaging",
"psutil",
"pyyaml",
Expand Down Expand Up @@ -49,6 +49,10 @@ version-file = "rapids_cli/_version.py"
[tool.hatch.version]
source = "vcs"

[tool.hatch.metadata]
# TODO: Remove me when cuda-core 1.0 is released
allow-direct-references = true

[tool.black]
# this should match the oldest version of Python the library supports
target-version = ["py310"]
Expand Down
11 changes: 3 additions & 8 deletions rapids_cli/debug/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pathlib import Path

import cuda.pathfinder
import pynvml
from cuda.core import system
from rich.console import Console
from rich.table import Table

Expand All @@ -20,11 +20,7 @@

def gather_cuda_version():
"""Return CUDA driver version as a string, similar to nvidia-smi output."""
version = pynvml.nvmlSystemGetCudaDriverVersion()
# pynvml returns an int like 12040 for 12.4, so format as string
major = version // 1000
minor = (version % 1000) // 10
patch = version % 10
major, minor, patch = system.get_driver_version_full()
if patch == 0:
return f"{major}.{minor}"
else:
Expand Down Expand Up @@ -69,14 +65,13 @@ def gather_tools():

def run_debug(output_format="console"):
"""Run debug."""
pynvml.nvmlInit()
debug_info = {
"date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"platform": platform.platform(),
"nvidia_smi_output": gather_command_output(
["nvidia-smi"], "Nvidia-smi not installed"
),
"driver_version": pynvml.nvmlSystemGetDriverVersion(),
"driver_version": ".".join(str(x) for x in system.get_driver_version_full(kernel_mode=True)),
"cuda_version": gather_cuda_version(),
"cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"),
"system_ctk": sorted(
Expand Down
13 changes: 5 additions & 8 deletions rapids_cli/doctor/checks/cuda_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@
# SPDX-License-Identifier: Apache-2.0
"""Check for CUDA and driver compatibility."""

import pynvml
from cuda.core import system


def cuda_check(verbose=False):
    """Check CUDA availability.

    Looks up the CUDA driver version and returns it packed into one integer
    as ``major * 1000 + minor * 10 + patch`` (for example ``12050`` for
    CUDA 12.5.0).

    Args:
        verbose: Not used by this check.

    Returns:
        The CUDA driver version encoded as an integer.

    Raises:
        ValueError: If the version lookup raises ``system.NvmlError``.
    """
    try:
        # Use the default (non-kernel-mode) query: that is the CUDA version.
        # ``kernel_mode=True`` reports the NVIDIA kernel driver version
        # (the "550.54.15"-style string), which is not what this check returns.
        major, minor, patch = system.get_driver_version_full()
    except system.NvmlError as e:
        raise ValueError("Unable to look up CUDA version") from e
    return major * 1000 + minor * 10 + patch
18 changes: 9 additions & 9 deletions rapids_cli/doctor/checks/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
# SPDX-License-Identifier: Apache-2.0
"""GPU checks for the doctor command."""

import pynvml
from cuda.core import system

REQUIRED_COMPUTE_CAPABILITY = 7


def gpu_check(verbose=False):
    """Check GPU availability.

    Args:
        verbose: Not used by this check.

    Returns:
        A message of the form ``"GPU(s) detected: <count>"``.

    Raises:
        ValueError: If querying the device count raises ``system.NvmlError``.
        AssertionError: If the query succeeds but reports no GPUs.
    """
    try:
        num_gpus = system.Device.get_device_count()
    except system.NvmlError as e:
        raise ValueError("No available GPUs detected") from e
    # Raise explicitly instead of using ``assert`` so the check is still
    # enforced under ``python -O``, which strips assert statements. The
    # exception type and message are unchanged for callers.
    if not num_gpus > 0:
        raise AssertionError("No GPUs detected")
    return f"GPU(s) detected: {num_gpus}"
Expand All @@ -21,13 +20,14 @@ def gpu_check(verbose=False):
def check_gpu_compute_capability(verbose):
"""Check the system for GPU Compute Capability."""
try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
num_gpus = system.Device.get_device_count()
if num_gpus == 0:
raise system.NvmlError(1)
except system.NvmlError as e:
raise ValueError("No GPU - cannot determine GPU Compute Capability") from e

for i in range(pynvml.nvmlDeviceGetCount()):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
for i, device in enumerate(system.Device.get_all_devices()):
major, minor = device.cuda_compute_capability
if major >= REQUIRED_COMPUTE_CAPABILITY:
continue
else:
Expand Down
20 changes: 9 additions & 11 deletions rapids_cli/doctor/checks/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import warnings

import psutil
import pynvml

from cuda.core import system


def get_system_memory(verbose=False):
Expand All @@ -17,15 +18,11 @@ def get_system_memory(verbose=False):

def get_gpu_memory(verbose=False):
    """Return the combined total memory of all GPUs, in gigabytes.

    Each device's total memory in bytes is divided by ``1024**3`` and the
    results are added together. ``verbose`` is not used by this function.
    """
    bytes_per_gb = 1024**3
    return sum(
        gpu.memory_info.total / bytes_per_gb
        for gpu in system.Device.get_all_devices()
    )


Expand All @@ -36,9 +33,10 @@ def check_memory_to_gpu_ratio(verbose=True):

"""
try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
if system.Device.get_device_count() == 0:
raise system.NvmlError(1)
except system.NvmlError:
raise ValueError("GPU not found. Please ensure GPUs are installed.")

system_memory = get_system_memory(verbose)
gpu_memory = get_gpu_memory(verbose)
Expand Down
23 changes: 11 additions & 12 deletions rapids_cli/doctor/checks/nvlink.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@
# SPDX-License-Identifier: Apache-2.0
"""Check for NVLink status."""

import pynvml
from cuda.core import system
from cuda.bindings import nvml


def check_nvlink_status(verbose=True, **kwargs):
"""Check NVLink status across all GPUs."""
try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
device_count = system.Device.get_device_count()
if device_count == 0:
raise system.NvmlError(1)
except system.NvmlError as e:
raise ValueError("GPU not found. Please ensure GPUs are installed.") from e

device_count = pynvml.nvmlDeviceGetCount()

# NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing
# to link to, so there is nothing to check.
if device_count < 2:
Expand All @@ -25,24 +26,22 @@ def check_nvlink_status(verbose=True, **kwargs):

failed_links: list[tuple[int, int]] = []

for gpu_idx in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
for gpu_idx, device in enumerate(system.Device.get_all_devices()):
# NVML provides no API to query the number of NVLink slots on a device
# (e.g. V100=6, A100=12, H100=18). The only way to discover the real count
# is to iterate up to NVML_NVLINK_MAX_LINKS and stop when the driver signals
# that link_id is out of range via NVMLError_InvalidArgument.
for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
for link_id in range(nvml.NVLINK_MAX_LINKS):
try:
# nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED
# if the link is active, or NVML_FEATURE_DISABLED if it is not.
state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id)
if state == pynvml.NVML_FEATURE_DISABLED:
if not device.get_nvlink(link_id).state:
failed_links.append((gpu_idx, link_id))
except pynvml.NVMLError_NotSupported:
except system.NotSupportedError:
# The driver reports NVLink is not supported on this system.
# There is nothing to check — skip like the single-GPU case above.
return False
except pynvml.NVMLError_InvalidArgument:
except system.InvalidArgumentError:
# link_id exceeds the number of NVLink slots on this device.
# Stop iterating links for this GPU.
break
Expand Down
16 changes: 9 additions & 7 deletions rapids_cli/tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,34 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch

import pynvml
import pytest

from rapids_cli.doctor.checks.cuda_driver import cuda_check


def test_cuda_check_success():
    """cuda_check turns a (12, 5, 0) version tuple into the integer 12050."""
    version_stub = patch(
        "cuda.core.system.get_driver_version_full", return_value=(12, 5, 0)
    )
    with version_stub:
        encoded = cuda_check(verbose=True)
    assert encoded == 12050


def test_cuda_check_init_fails():
    """An NvmlError raised by the patched init_v2 surfaces as ValueError."""
    from cuda.bindings import nvml

    init_failure = patch("cuda.bindings.nvml.init_v2", side_effect=nvml.NvmlError(1))
    expected_error = pytest.raises(ValueError, match="Unable to look up CUDA version")
    with init_failure, expected_error:
        cuda_check()


def test_cuda_check_version_query_fails():
from cuda.bindings import nvml

with (
patch("pynvml.nvmlInit"),
patch("cuda.bindings.nvml.init_v2"),
patch(
"pynvml.nvmlSystemGetCudaDriverVersion",
side_effect=pynvml.NVMLError(1),
"cuda.bindings.nvml.system_get_cuda_driver_version",
side_effect=nvml.NvmlError(1),
),
):
with pytest.raises(ValueError, match="Unable to look up CUDA version"):
Expand Down
17 changes: 9 additions & 8 deletions rapids_cli/tests/test_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@

def test_gather_cuda_version():
    """A zero patch component is omitted: (12, 4, 0) is rendered as "12.4"."""
    version_stub = patch(
        "cuda.core.system.get_driver_version_full", return_value=(12, 4, 0)
    )
    with version_stub:
        assert gather_cuda_version() == "12.4"


def test_gather_cuda_version_with_patch():
    """A non-zero patch component is kept: (12, 34, 5) is rendered as "12.34.5"."""
    version_stub = patch(
        "cuda.core.system.get_driver_version_full", return_value=(12, 34, 5)
    )
    with version_stub:
        assert gather_cuda_version() == "12.34.5"

Expand Down Expand Up @@ -74,9 +74,9 @@ def test_run_debug_console(capsys):
mock_vm.total = 32 * 1024**3

with (
patch("pynvml.nvmlInit"),
patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"),
patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
patch("cuda.bindings.nvml.init_v2"),
patch("cuda.bindings.nvml.system_get_driver_version", return_value="550.54.15"),
patch("cuda.bindings.nvml.system_get_cuda_driver_version", return_value=12040),
patch(
"cuda.pathfinder.find_nvidia_header_directory",
return_value="/usr/local/cuda/include",
Expand All @@ -95,10 +95,11 @@ def test_run_debug_console(capsys):

def test_run_debug_json(capsys):
"""Test run_debug with JSON output."""

with (
patch("pynvml.nvmlInit"),
patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"),
patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040),
patch("cuda.bindings.nvml.init_v2"),
patch("cuda.bindings.nvml.system_get_driver_version", return_value="550.54.15"),
patch("cuda.bindings.nvml.system_get_cuda_driver_version", return_value=12040),
patch(
"cuda.pathfinder.find_nvidia_header_directory",
return_value="/usr/local/cuda/include",
Expand Down
Loading