rapidsai · ncclementi · Apr 29, 2026 · Feb 11, 2026 · Apr 20, 2026 · Apr 22, 2026
@@ -10,17 +10,17 @@
 from importlib.metadata import distributions, version
 from pathlib import Path
 
-import cuda.pathfinder
-import pynvml
 from rich.console import Console
 from rich.table import Table
 
+from rapids_cli.providers import get_gpu_info, get_system_info
+
 console = Console()
 
 
 def gather_cuda_version():
     """Return CUDA driver version as a string, similar to nvidia-smi output."""
-    version = pynvml.nvmlSystemGetCudaDriverVersion()
+    version = get_gpu_info().cuda_driver_version
     # pynvml returns an int like 12040 for 12.4, so format as string
     major = version // 1000
     minor = (version % 1000) // 10
@@ -69,16 +69,18 @@ def gather_tools():
 
 def run_debug(output_format="console"):
     """Run debug."""
-    pynvml.nvmlInit()
+    gpu_info = get_gpu_info()
+    system_info = get_system_info()
+
     debug_info = {
         "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         "platform": platform.platform(),
         "nvidia_smi_output": gather_command_output(
             ["nvidia-smi"], "Nvidia-smi not installed"
         ),
-        "driver_version": pynvml.nvmlSystemGetDriverVersion(),
+        "driver_version": gpu_info.driver_version,
         "cuda_version": gather_cuda_version(),
-        "cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"),
+        "cuda_runtime_path": system_info.cuda_runtime_path,
         "system_ctk": sorted(
             [str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()]
         ),

@@ -2,17 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0
 """Check for CUDA and driver compatibility."""
 
-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info
 
 
-def cuda_check(verbose=False):
+def cuda_check(verbose=False, **kwargs):
     """Check CUDA availability."""
     try:
-        pynvml.nvmlInit()
-        try:
-            cuda_version = pynvml.nvmlSystemGetCudaDriverVersion()
-            return cuda_version
-        except pynvml.NVMLError as e:
-            raise ValueError("Unable to look up CUDA version") from e
-    except pynvml.NVMLError as e:
+        return get_gpu_info().cuda_driver_version
+    except HardwareInfoError as e:
         raise ValueError("Unable to look up CUDA version") from e
@@ -186,12 +186,11 @@ def _gather_toolkit_info() -> CudaToolkitInfo:  # pragma: no cover
     return info
 
 
-def cuda_toolkit_check(
-    verbose=False, *, toolkit_info: CudaToolkitInfo | None = None, **kwargs
-):
+def cuda_toolkit_check(verbose=False, **kwargs):
     """Check CUDA toolkit library availability and version consistency."""
-    if toolkit_info is None:  # pragma: no cover
-        toolkit_info = _gather_toolkit_info()
+    from rapids_cli.providers import get_toolkit_info
+
+    toolkit_info = get_toolkit_info()
 
     # Check library findability
     if toolkit_info.missing_libs:

@@ -2,38 +2,35 @@
 # SPDX-License-Identifier: Apache-2.0
 """GPU checks for the doctor command."""
 
-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info
 
 REQUIRED_COMPUTE_CAPABILITY = 7
 
 
-def gpu_check(verbose=False):
+def gpu_check(verbose=False, **kwargs):
     """Check GPU availability."""
     try:
-        pynvml.nvmlInit()
-        num_gpus = pynvml.nvmlDeviceGetCount()
-    except pynvml.NVMLError as e:
+        num_gpus = get_gpu_info().device_count
+    except HardwareInfoError as e:
         raise ValueError("No available GPUs detected") from e
     assert num_gpus > 0, "No GPUs detected"
     return f"GPU(s) detected: {num_gpus}"
 
 
-def check_gpu_compute_capability(verbose):
+def check_gpu_compute_capability(verbose=False, **kwargs):
     """Check the system for GPU Compute Capability."""
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        devices = get_gpu_info().devices
+    except HardwareInfoError as e:
         raise ValueError("No GPU - cannot determine GPU Compute Capability") from e
 
-    for i in range(pynvml.nvmlDeviceGetCount()):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
-        if major >= REQUIRED_COMPUTE_CAPABILITY:
+    for dev in devices:
+        if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY:
             continue
-        else:
-            raise ValueError(
-                f"GPU {i} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} "
-                f"or higher but only has {major}.{minor}."
-                "See https://developer.nvidia.com/cuda-gpus for more information."
-            )
+        raise ValueError(
+            f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} "
+            f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}."
+            "See https://developer.nvidia.com/cuda-gpus for more information."
+        )
     return True
@@ -4,45 +4,31 @@
 
 import warnings
 
-import psutil
-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info, get_system_info
 
 
-def get_system_memory(verbose=False):
+def get_system_memory(verbose=False, **kwargs):
     """Get the total system memory."""
-    virtual_memory = psutil.virtual_memory()
-    total_memory = virtual_memory.total / (1024**3)  # converts bytes to gigabytes
-    return total_memory
+    return get_system_info().total_memory_bytes / (1024**3)
 
 
-def get_gpu_memory(verbose=False):
+def get_gpu_memory(verbose=False, **kwargs):
     """Get the total GPU memory."""
-    pynvml.nvmlInit()
-    gpus = pynvml.nvmlDeviceGetCount()
-    gpu_memory_total = 0
-    for i in range(gpus):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        gpu_memory_total += memory_info.total / (1024**3)  # converts to gigabytes
+    return sum(dev.memory_total_bytes for dev in get_gpu_info().devices) / (1024**3)
 
-    pynvml.nvmlShutdown()
-    return gpu_memory_total
 
-
-def check_memory_to_gpu_ratio(verbose=True):
+def check_memory_to_gpu_ratio(verbose=True, **kwargs):
     """Check the system for a 2:1 ratio of system Memory to total GPU Memory.
 
     This is especially useful for Dask.
-
     """
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        _ = get_gpu_info().device_count
+    except HardwareInfoError as e:
         raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
 
-    system_memory = get_system_memory(verbose)
-    gpu_memory = get_gpu_memory(verbose)
-    ratio = system_memory / gpu_memory
+    ratio = get_system_memory() / get_gpu_memory()
     if ratio < 1.8:
         warnings.warn(
             "System Memory to total GPU Memory ratio not at least 2:1 ratio. "

@@ -2,18 +2,18 @@
 # SPDX-License-Identifier: Apache-2.0
 """Check for NVLink status."""
 
-import pynvml
+from rapids_cli.hardware import HardwareInfoError
+from rapids_cli.providers import get_gpu_info
 
 
 def check_nvlink_status(verbose=True, **kwargs):
     """Check NVLink status across all GPUs."""
+    gpu_info = get_gpu_info()
     try:
-        pynvml.nvmlInit()
-    except pynvml.NVMLError as e:
+        device_count = gpu_info.device_count
+    except HardwareInfoError as e:
         raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
 
-    device_count = pynvml.nvmlDeviceGetCount()
-
     # NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing
     # to link to, so there is nothing to check.
     if device_count < 2:
@@ -23,29 +23,20 @@ def check_nvlink_status(verbose=True, **kwargs):
     # model). Mixed configurations — e.g. some NVLink-capable GPUs alongside some
     # that are not — are not handled and may produce misleading results.
 
-    failed_links: list[tuple[int, int]] = []
-
-    for gpu_idx in range(device_count):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
-        # NVML provides no API to query the number of NVLink slots on a device
-        # (e.g. V100=6, A100=12, H100=18). The only way to discover the real count
-        # is to iterate up to NVML_NVLINK_MAX_LINKS and stop when the driver signals
-        # that link_id is out of range via NVMLError_InvalidArgument.
-        for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
-            try:
-                # nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED
-                # if the link is active, or NVML_FEATURE_DISABLED if it is not.
-                state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id)
-                if state == pynvml.NVML_FEATURE_DISABLED:
-                    failed_links.append((gpu_idx, link_id))
-            except pynvml.NVMLError_NotSupported:
-                # The driver reports NVLink is not supported on this system.
-                # There is nothing to check — skip like the single-GPU case above.
-                return False
-            except pynvml.NVMLError_InvalidArgument:
-                # link_id exceeds the number of NVLink slots on this device.
-                # Stop iterating links for this GPU.
-                break
+    devices = gpu_info.devices
+
+    # An empty nvlink_states means the driver reported NVLink as unsupported (or
+    # no links were enumerated) for that device. Treat a system where no device
+    # advertises links the same as the single-GPU case — nothing to check.
+    if all(not dev.nvlink_states for dev in devices):
+        return False
+
+    failed_links: list[tuple[int, int]] = [
+        (dev.index, link_id)
+        for dev in devices
+        for link_id, active in enumerate(dev.nvlink_states)
+        if not active
+    ]
 
     if failed_links:
         details = ", ".join(f"GPU {gpu} link {link}" for gpu, link in failed_links)

@@ -8,8 +8,10 @@
 
 from rich.console import Console
 
+from rapids_cli import providers
 from rapids_cli._compatibility import entry_points
 from rapids_cli.constants import DOCTOR_SYMBOL
+from rapids_cli.hardware import DefaultSystemInfo, NvmlGpuInfo
 
 console = Console()
 
@@ -76,6 +78,8 @@ def doctor_check(
         console.print("Dry run, skipping checks")
         return True
 
+    providers.set_providers(gpu_info=NvmlGpuInfo(), system_info=DefaultSystemInfo())
+
     results: list[CheckResult] = []
     with console.status("[bold green]Running checks...") as ui_status:
         for i, check_fn in enumerate(checks):