From aad73a2029ce9cbccfe28500098dc4b3a2ed6ce9 Mon Sep 17 00:00:00 2001 From: Jaya Venkatesh Date: Wed, 29 Apr 2026 08:29:22 -0700 Subject: [PATCH 1/4] Fix cuda_toolkit_check reading kernel driver instead of CUDA driver --- rapids_cli/doctor/checks/cuda_toolkit.py | 7 +++++-- rapids_cli/tests/test_cuda_toolkit.py | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/rapids_cli/doctor/checks/cuda_toolkit.py b/rapids_cli/doctor/checks/cuda_toolkit.py index 033bc52..e44efc0 100644 --- a/rapids_cli/doctor/checks/cuda_toolkit.py +++ b/rapids_cli/doctor/checks/cuda_toolkit.py @@ -173,9 +173,12 @@ def _gather_toolkit_info() -> CudaToolkitInfo: # pragma: no cover except (DynamicLibNotFoundError, RuntimeError): info.missing_libs.append(soname) - # Get driver version + # Get driver version. Default mode returns the CUDA Driver API version + # (e.g. 13 for CUDA 13.0), which is what the toolkit-vs-driver comparison + # below expects. kernel_mode=True would return the NVIDIA kernel module + # version (e.g. 580) and silently break all comparisons. try: - info.driver_major = get_driver_version(kernel_mode=True)[0] + info.driver_major = get_driver_version()[0] except Exception: info.driver_major = None diff --git a/rapids_cli/tests/test_cuda_toolkit.py b/rapids_cli/tests/test_cuda_toolkit.py index 8d1a19a..9ef2d11 100644 --- a/rapids_cli/tests/test_cuda_toolkit.py +++ b/rapids_cli/tests/test_cuda_toolkit.py @@ -8,6 +8,7 @@ from rapids_cli.doctor.checks.cuda_toolkit import ( CudaToolkitInfo, _ctypes_cuda_version, + _gather_toolkit_info, _get_toolkit_cuda_major, cuda_toolkit_check, ) @@ -168,3 +169,23 @@ def test_check_cuda_home_newer_than_driver(): ): with pytest.raises(ValueError, match="CUDA_HOME"): cuda_toolkit_check(toolkit_info=info) + + +def test_gather_toolkit_info_driver_major_is_cuda_major(): + """driver_major must be the CUDA Driver API major, not the kernel driver major. 
+ + Regression test: prior code passed kernel_mode=True to get_driver_version, + which returns the NVIDIA kernel module version (e.g. 580) and broke every + toolkit-vs-driver comparison. Skips when the helper can't run at all + (e.g. cuda.pathfinder unavailable, no GPU) so macOS and no-GPU CI runners + pass cleanly; on a real GPU host this would have caught the original bug. + """ + try: + info = _gather_toolkit_info() + except Exception as e: + pytest.skip(f"_gather_toolkit_info unavailable on this platform: {e}") + if info.driver_major is not None: + assert info.driver_major < 100, ( + f"driver_major={info.driver_major} looks like a kernel driver " + f"version, not a CUDA Driver API major" + ) From 512c820a7d5da00366b50ab197d4d105d6cd9b25 Mon Sep 17 00:00:00 2001 From: Jaya Venkatesh Date: Wed, 29 Apr 2026 08:29:31 -0700 Subject: [PATCH 2/4] Add cuda-bindings to runtime deps and missing cuda-core to conda recipe --- conda/recipes/rapids-cli/recipe.yaml | 2 ++ dependencies.yaml | 1 + pyproject.toml | 1 + 3 files changed, 4 insertions(+) diff --git a/conda/recipes/rapids-cli/recipe.yaml b/conda/recipes/rapids-cli/recipe.yaml index 4e3ad4a..7b1d499 100644 --- a/conda/recipes/rapids-cli/recipe.yaml +++ b/conda/recipes/rapids-cli/recipe.yaml @@ -31,6 +31,8 @@ requirements: run: - python - importlib-metadata >=4.13.0 + - cuda-bindings >=12.9.6,!=13.0.*,!=13.1.* + - cuda-core >=0.6.0 - cuda-pathfinder >=1.2.3 - nvidia-ml-py >=12.0 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index d312739..f93919b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -62,6 +62,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cuda-core >=0.6.0 + - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* - nvidia-ml-py>=12.0 - cuda-pathfinder >=1.2.3 - packaging diff --git a/pyproject.toml b/pyproject.toml index 882cc68..b2c0d4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ license-files = ["LICENSE"] readme = "README.md" 
requires-python = ">=3.10" dependencies = [ + "cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*", "cuda-core >=0.6.0", "cuda-pathfinder >=1.2.3", "importlib-metadata >= 4.13.0; python_version < '3.12'", From c65de815d514450bc5ecc55aeb1dfb52f4d58d6c Mon Sep 17 00:00:00 2001 From: Jaya Venkatesh Date: Wed, 29 Apr 2026 08:52:48 -0700 Subject: [PATCH 3/4] fix comments --- rapids_cli/doctor/checks/cuda_toolkit.py | 4 ---- rapids_cli/tests/test_cuda_toolkit.py | 9 +-------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/rapids_cli/doctor/checks/cuda_toolkit.py b/rapids_cli/doctor/checks/cuda_toolkit.py index e44efc0..c040e4d 100644 --- a/rapids_cli/doctor/checks/cuda_toolkit.py +++ b/rapids_cli/doctor/checks/cuda_toolkit.py @@ -173,10 +173,6 @@ def _gather_toolkit_info() -> CudaToolkitInfo: # pragma: no cover except (DynamicLibNotFoundError, RuntimeError): info.missing_libs.append(soname) - # Get driver version. Default mode returns the CUDA Driver API version - # (e.g. 13 for CUDA 13.0), which is what the toolkit-vs-driver comparison - # below expects. kernel_mode=True would return the NVIDIA kernel module - # version (e.g. 580) and silently break all comparisons. try: info.driver_major = get_driver_version()[0] except Exception: diff --git a/rapids_cli/tests/test_cuda_toolkit.py b/rapids_cli/tests/test_cuda_toolkit.py index 9ef2d11..38e4f93 100644 --- a/rapids_cli/tests/test_cuda_toolkit.py +++ b/rapids_cli/tests/test_cuda_toolkit.py @@ -172,14 +172,7 @@ def test_check_cuda_home_newer_than_driver(): def test_gather_toolkit_info_driver_major_is_cuda_major(): - """driver_major must be the CUDA Driver API major, not the kernel driver major. - - Regression test: prior code passed kernel_mode=True to get_driver_version, - which returns the NVIDIA kernel module version (e.g. 580) and broke every - toolkit-vs-driver comparison. Skips when the helper can't run at all - (e.g. 
cuda.pathfinder unavailable, no GPU) so macOS and no-GPU CI runners - pass cleanly; on a real GPU host this would have caught the original bug. - """ + """Regression: driver_major must be the CUDA Driver API major, not the kernel driver major.""" try: info = _gather_toolkit_info() except Exception as e: From 97a78bf9d038d16fada682ed77fdfeb489f82152 Mon Sep 17 00:00:00 2001 From: Jaya Venkatesh Date: Thu, 30 Apr 2026 11:02:08 -0700 Subject: [PATCH 4/4] Document cuda-bindings version pin rationale Signed-off-by: Jaya Venkatesh --- dependencies.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dependencies.yaml b/dependencies.yaml index f93919b..3afec7e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -62,6 +62,9 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cuda-core >=0.6.0 + # NVML APIs we use via cuda.core.system landed in cuda-bindings + # 12.9.6 (CUDA 12) and 13.2.0 (CUDA 13). The 13.0/13.1 + # wheels pre-date the 13.2.0 landing and are excluded. - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* - nvidia-ml-py>=12.0 - cuda-pathfinder >=1.2.3