Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions rapids_cli/debug/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,17 @@
from importlib.metadata import distributions, version
from pathlib import Path

import cuda.pathfinder
import pynvml
from rich.console import Console
from rich.table import Table

from rapids_cli.providers import get_gpu_info, get_system_info

console = Console()


def gather_cuda_version():
"""Return CUDA driver version as a string, similar to nvidia-smi output."""
version = pynvml.nvmlSystemGetCudaDriverVersion()
version = get_gpu_info().cuda_driver_version
# pynvml returns an int like 12040 for 12.4, so format as string
major = version // 1000
minor = (version % 1000) // 10
Expand Down Expand Up @@ -69,16 +69,18 @@ def gather_tools():

def run_debug(output_format="console"):
"""Run debug."""
pynvml.nvmlInit()
gpu_info = get_gpu_info()
system_info = get_system_info()

debug_info = {
"date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"platform": platform.platform(),
"nvidia_smi_output": gather_command_output(
["nvidia-smi"], "Nvidia-smi not installed"
),
"driver_version": pynvml.nvmlSystemGetDriverVersion(),
"driver_version": gpu_info.driver_version,
"cuda_version": gather_cuda_version(),
"cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"),
"cuda_runtime_path": system_info.cuda_runtime_path,
"system_ctk": sorted(
[str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()]
),
Expand Down
14 changes: 5 additions & 9 deletions rapids_cli/doctor/checks/cuda_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,13 @@
# SPDX-License-Identifier: Apache-2.0
"""Check for CUDA and driver compatibility."""

import pynvml
from rapids_cli.hardware import HardwareInfoError
from rapids_cli.providers import get_gpu_info


def cuda_check(verbose=False):
def cuda_check(verbose=False, **kwargs):
"""Check CUDA availability."""
try:
pynvml.nvmlInit()
try:
cuda_version = pynvml.nvmlSystemGetCudaDriverVersion()
return cuda_version
except pynvml.NVMLError as e:
raise ValueError("Unable to look up CUDA version") from e
except pynvml.NVMLError as e:
return get_gpu_info().cuda_driver_version
except HardwareInfoError as e:
raise ValueError("Unable to look up CUDA version") from e
9 changes: 4 additions & 5 deletions rapids_cli/doctor/checks/cuda_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,11 @@ def _gather_toolkit_info() -> CudaToolkitInfo: # pragma: no cover
return info


def cuda_toolkit_check(
verbose=False, *, toolkit_info: CudaToolkitInfo | None = None, **kwargs
):
def cuda_toolkit_check(verbose=False, **kwargs):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happened here with passing toolkit_info: CudaToolkitInfo ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The toolkit_info parameter was replaced by the provider registry — cuda_toolkit_check now reads from get_toolkit_info() (via rapids_cli.providers) instead of receiving it as a kwarg. This is the core change of the DI refactor: checks no longer need provider parameters threaded through their signatures. Tests install fakes into the registry via monkeypatch.setattr fixtures in conftest.py.

Also reverted the local variable name back to toolkit_info in 56ba029 to minimize the diff.

"""Check CUDA toolkit library availability and version consistency."""
if toolkit_info is None: # pragma: no cover
toolkit_info = _gather_toolkit_info()
from rapids_cli.providers import get_toolkit_info

toolkit_info = get_toolkit_info()

# Check library findability
if toolkit_info.missing_libs:
Expand Down
33 changes: 15 additions & 18 deletions rapids_cli/doctor/checks/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,35 @@
# SPDX-License-Identifier: Apache-2.0
"""GPU checks for the doctor command."""

import pynvml
from rapids_cli.hardware import HardwareInfoError
from rapids_cli.providers import get_gpu_info

REQUIRED_COMPUTE_CAPABILITY = 7


def gpu_check(verbose=False):
def gpu_check(verbose=False, **kwargs):
"""Check GPU availability."""
try:
pynvml.nvmlInit()
num_gpus = pynvml.nvmlDeviceGetCount()
except pynvml.NVMLError as e:
num_gpus = get_gpu_info().device_count
except HardwareInfoError as e:
raise ValueError("No available GPUs detected") from e
assert num_gpus > 0, "No GPUs detected"
return f"GPU(s) detected: {num_gpus}"


def check_gpu_compute_capability(verbose):
def check_gpu_compute_capability(verbose=False, **kwargs):
"""Check the system for GPU Compute Capability."""
try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
devices = get_gpu_info().devices
except HardwareInfoError as e:
raise ValueError("No GPU - cannot determine GPU Compute Capability") from e

for i in range(pynvml.nvmlDeviceGetCount()):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
if major >= REQUIRED_COMPUTE_CAPABILITY:
for dev in devices:
if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY:
continue
else:
raise ValueError(
f"GPU {i} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} "
f"or higher but only has {major}.{minor}."
"See https://developer.nvidia.com/cuda-gpus for more information."
)
raise ValueError(
f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} "
f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}."
"See https://developer.nvidia.com/cuda-gpus for more information."
)
return True
34 changes: 10 additions & 24 deletions rapids_cli/doctor/checks/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,45 +4,31 @@

import warnings

import psutil
import pynvml
from rapids_cli.hardware import HardwareInfoError
from rapids_cli.providers import get_gpu_info, get_system_info


def get_system_memory(verbose=False):
def get_system_memory(verbose=False, **kwargs):
"""Get the total system memory."""
virtual_memory = psutil.virtual_memory()
total_memory = virtual_memory.total / (1024**3) # converts bytes to gigabytes
return total_memory
return get_system_info().total_memory_bytes / (1024**3)


def get_gpu_memory(verbose=False):
def get_gpu_memory(verbose=False, **kwargs):
"""Get the total GPU memory."""
pynvml.nvmlInit()
gpus = pynvml.nvmlDeviceGetCount()
gpu_memory_total = 0
for i in range(gpus):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
gpu_memory_total += memory_info.total / (1024**3) # converts to gigabytes
return sum(dev.memory_total_bytes for dev in get_gpu_info().devices) / (1024**3)

pynvml.nvmlShutdown()
return gpu_memory_total


def check_memory_to_gpu_ratio(verbose=True):
def check_memory_to_gpu_ratio(verbose=True, **kwargs):
"""Check the system for a 2:1 ratio of system Memory to total GPU Memory.

This is especially useful for Dask.

"""
try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
_ = get_gpu_info().device_count
except HardwareInfoError as e:
raise ValueError("GPU not found. Please ensure GPUs are installed.") from e

system_memory = get_system_memory(verbose)
gpu_memory = get_gpu_memory(verbose)
ratio = system_memory / gpu_memory
ratio = get_system_memory() / get_gpu_memory()
if ratio < 1.8:
warnings.warn(
"System Memory to total GPU Memory ratio not at least 2:1 ratio. "
Expand Down
47 changes: 19 additions & 28 deletions rapids_cli/doctor/checks/nvlink.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
# SPDX-License-Identifier: Apache-2.0
"""Check for NVLink status."""

import pynvml
from rapids_cli.hardware import HardwareInfoError
from rapids_cli.providers import get_gpu_info


def check_nvlink_status(verbose=True, **kwargs):
"""Check NVLink status across all GPUs."""
gpu_info = get_gpu_info()
try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
device_count = gpu_info.device_count
except HardwareInfoError as e:
raise ValueError("GPU not found. Please ensure GPUs are installed.") from e

device_count = pynvml.nvmlDeviceGetCount()

# NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing
# to link to, so there is nothing to check.
if device_count < 2:
Expand All @@ -23,29 +23,20 @@ def check_nvlink_status(verbose=True, **kwargs):
# model). Mixed configurations — e.g. some NVLink-capable GPUs alongside some
# that are not — are not handled and may produce misleading results.

failed_links: list[tuple[int, int]] = []

for gpu_idx in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
# NVML provides no API to query the number of NVLink slots on a device
# (e.g. V100=6, A100=12, H100=18). The only way to discover the real count
# is to iterate up to NVML_NVLINK_MAX_LINKS and stop when the driver signals
# that link_id is out of range via NVMLError_InvalidArgument.
for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
try:
# nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED
# if the link is active, or NVML_FEATURE_DISABLED if it is not.
state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id)
if state == pynvml.NVML_FEATURE_DISABLED:
failed_links.append((gpu_idx, link_id))
except pynvml.NVMLError_NotSupported:
# The driver reports NVLink is not supported on this system.
# There is nothing to check — skip like the single-GPU case above.
return False
except pynvml.NVMLError_InvalidArgument:
# link_id exceeds the number of NVLink slots on this device.
# Stop iterating links for this GPU.
break
devices = gpu_info.devices

# An empty nvlink_states means the driver reported NVLink as unsupported (or
# no links were enumerated) for that device. Treat a system where no device
# advertises links the same as the single-GPU case — nothing to check.
if all(not dev.nvlink_states for dev in devices):
return False

failed_links: list[tuple[int, int]] = [
(dev.index, link_id)
for dev in devices
for link_id, active in enumerate(dev.nvlink_states)
if not active
]
Comment on lines +34 to +39
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't had the time to try this, but I'm not sure if this is covering the same than the cases we had before. The tests seem have to be changed too so it's hard to know.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The refactored check_nvlink_status preserves the exact same logic from PR #143. It reads the same data through the GpuInfoProvider Protocol (which wraps the same pynvml calls in NvmlGpuInfo) instead of calling pynvml directly. The tests cover: single GPU (skip), multi-GPU all links active, inactive links raising, no-nvlink devices, partial failure, and mixed link counts.


if failed_links:
details = ", ".join(f"GPU {gpu} link {link}" for gpu, link in failed_links)
Expand Down
4 changes: 4 additions & 0 deletions rapids_cli/doctor/doctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@

from rich.console import Console

from rapids_cli import providers
from rapids_cli._compatibility import entry_points
from rapids_cli.constants import DOCTOR_SYMBOL
from rapids_cli.hardware import DefaultSystemInfo, NvmlGpuInfo

console = Console()

Expand Down Expand Up @@ -76,6 +78,8 @@ def doctor_check(
console.print("Dry run, skipping checks")
return True

providers.set_providers(gpu_info=NvmlGpuInfo(), system_info=DefaultSystemInfo())

results: list[CheckResult] = []
with console.status("[bold green]Running checks...") as ui_status:
for i, check_fn in enumerate(checks):
Expand Down
Loading
Loading