From e461b597054a0695db414cdb7fa730ab2a128a62 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 12:21:29 -0500 Subject: [PATCH 1/7] Decouple from numba.cuda further in tests in favor of cupy and cuda.core and add minimal extra testing --- ci/matrix.yaml | 2 + ci/test_cuda_compute_minimal_python.sh | 51 +++++++++++++ .../tests/compute/test_binary_search.py | 35 +++++++++ .../cuda_cccl/tests/compute/test_bindings.py | 2 + .../compute/test_deferred_annotations.py | 2 + .../tests/compute/test_func_caching.py | 8 ++ .../cuda_cccl/tests/compute/test_histogram.py | 2 + .../cuda_cccl/tests/compute/test_iterators.py | 10 ++- .../tests/compute/test_merge_sort.py | 55 +++++++------- .../tests/compute/test_nested_struct.py | 5 ++ .../cuda_cccl/tests/compute/test_no_numba.py | 75 +----------------- .../compute/test_permutation_iterator.py | 6 ++ .../tests/compute/test_radix_sort.py | 76 ++++++++++--------- python/cuda_cccl/tests/compute/test_reduce.py | 38 +++++----- python/cuda_cccl/tests/compute/test_scan.py | 16 ++-- .../tests/compute/test_segmented_reduce.py | 3 + .../tests/compute/test_segmented_sort.py | 51 +++++++------ python/cuda_cccl/tests/compute/test_select.py | 31 ++++++++ .../tests/compute/test_shuffle_iterator.py | 1 + .../compute/test_struct_field_validation.py | 2 + .../tests/compute/test_three_way_partition.py | 46 +++++++++++ .../cuda_cccl/tests/compute/test_transform.py | 5 ++ .../tests/compute/test_unique_by_key.py | 69 ++++++++--------- .../test_void_ptr_wrapper_validation.py | 4 +- .../tests/compute/test_zip_iterator.py | 11 ++- 25 files changed, 379 insertions(+), 227 deletions(-) create mode 100755 ci/test_cuda_compute_minimal_python.sh diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 6146d863c87..1fcb1df75ae 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -84,6 +84,7 @@ workflows: - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'} + - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'} # CCCL packaging: - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'} - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'} @@ -539,6 +540,7 @@ jobs: test_py_headers: { name: "Test cuda.cccl.headers", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} } test_py_coop: { name: "Test cuda.coop._experimental", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} } test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} } + test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} } test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} } # Run jobs for 'target' project (ci/util/build_and_test_targets.sh): diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh new file mode 100755 index 00000000000..c48b313ed31 --- /dev/null +++ b/ci/test_cuda_compute_minimal_python.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "$ci_dir/.." && pwd)" +source "$ci_dir/pyenv_helper.sh" + +# Parse common arguments +source "$ci_dir/util/python/common_arg_parser.sh" +parse_python_args "$@" +require_py_version "Usage: $0 -py-version " + +cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2) + +# Setup Python environment +setup_python_env "${py_version}" + +# Fetch or build the cuda_cccl wheel: +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh") + "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/ + wheelhouse_dir="/home/coder/cccl/wheelhouse" +else + "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}" + wheelhouse_dir="${repo_root}/wheelhouse" +fi + +# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the +# full cu* extras because those pull in numba/numba-cuda. In a clean minimal +# environment, the test phase below runs only tests marked no_numba. +CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)" +python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]" +python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x" + +if python - <<'PY' +try: + import numba.cuda # noqa: F401 +except Exception as exc: + print(f"numba.cuda unavailable; running no_numba subset: {exc!r}") + raise SystemExit(1) +else: + print("numba.cuda available; running full compute test suite.") +PY +then + cd "${repo_root}/python/cuda_cccl/tests/" + python -m pytest -n 6 -v compute/ -m "not large" +else + cd "${repo_root}/python/cuda_cccl/tests/" + python -m pytest -n 6 -v compute/ -m "not large and no_numba" +fi diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index a4f7d047a9b..94863c3fe74 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -6,6 +6,8 @@ import pytest import cuda.compute +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp DTYPE_LIST = [ np.int32, @@ -39,6 +41,39 @@ def disable_sass_check(monkeypatch): ) +def _raw_less_i32_op() -> RawOp: + source = """ +extern "C" __device__ void less_i32(void* lhs, void* rhs, void* result) { + int lhs_value = *static_cast(lhs); + int rhs_value = *static_cast(rhs); + *static_cast(result) = lhs_value < rhs_value; +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="less_i32") + + +@pytest.mark.no_numba +def test_lower_bound_raw_op_minimal(): + h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) + h_values = np.array([0, 3, 4, 10], dtype=np.int32) + + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = cp.empty(len(h_values), dtype=np.uintp) + + cuda.compute.lower_bound( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_out, + comp=_raw_less_i32_op(), + ) + + expected = np.searchsorted(h_data, h_values, side="left").astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + @pytest.mark.parametrize("dtype", DTYPE_LIST) @pytest.mark.parametrize( "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)] diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py index 2f16da620ef..ccfbd8d0bc3 100644 --- a/python/cuda_cccl/tests/compute/test_bindings.py +++ b/python/cuda_cccl/tests/compute/test_bindings.py @@ -4,6 +4,8 @@ import cuda.compute._bindings as bindings +pytestmark = pytest.mark.no_numba + @pytest.fixture( params=[ diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py index c30aeda4068..a584b8ffbe2 100644 --- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py +++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py @@ -7,10 +7,12 @@ import cupy as cp import numpy as np +import pytest from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into +@pytest.mark.no_numba def test_deferred_annotations(): # test that we can use @gpu_struct with deferred annotations # GH: #6421 diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py index 0da32901a32..5d12458790d 100644 --- a/python/cuda_cccl/tests/compute/test_func_caching.py +++ b/python/cuda_cccl/tests/compute/test_func_caching.py @@ -1,10 +1,12 @@ import numpy as np +import pytest from cuda.compute._caching import CachableFunction global_x = 1 +@pytest.mark.no_numba def test_func_caching_basic(): def func(x): return x @@ -19,6 +21,7 @@ def func(x): assert f1 == f2 +@pytest.mark.no_numba def test_func_caching_different_names(): def func(x): return x @@ -33,6 +36,7 @@ def func2(x): assert f1 != f2 +@pytest.mark.no_numba def test_func_caching_different_code(): def func(x): return x @@ -46,6 +50,7 @@ def func(x): assert f1 != f2 +@pytest.mark.no_numba def test_func_caching_with_closure(): def factory(x): def func(y): @@ -61,6 +66,7 @@ def func(y): assert f1 != f3 +@pytest.mark.no_numba def test_func_caching_with_global_variable(): global global_x @@ -107,6 +113,7 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func3) +@pytest.mark.no_numba def test_func_caching_with_global_np_ufunc(): def make_func(): def func(x): @@ -126,6 +133,7 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func2) +@pytest.mark.no_numba def test_func_caching_with_aliased_np_ufunc(): def make_func1(): amin = np.argmin diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py index 0a9d96674e6..5c104114754 100644 --- a/python/cuda_cccl/tests/compute/test_histogram.py +++ b/python/cuda_cccl/tests/compute/test_histogram.py @@ -12,6 +12,8 @@ CountingIterator, ) +pytestmark = pytest.mark.no_numba + DTYPE_LIST = [ np.uint8, np.uint16, diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py index 49e2ab39d15..c3b6acc9fc9 100644 --- a/python/cuda_cccl/tests/compute/test_iterators.py +++ b/python/cuda_cccl/tests/compute/test_iterators.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp -import numba.cuda import numpy as np import pytest @@ -22,6 +21,7 @@ ) +@pytest.mark.no_numba def test_constant_iterator_equality(): it1 = ConstantIterator(np.int32(0)) it2 = ConstantIterator(np.int32(0)) @@ -32,6 +32,7 @@ def test_constant_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba def test_counting_iterator_equality(): it1 = CountingIterator(np.int32(0)) it2 = CountingIterator(np.int32(0)) @@ -42,6 +43,7 @@ def test_counting_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba def test_cache_modified_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -122,11 +124,14 @@ def reverse_iterator_array(request): if array_type == "cupy": array = cp.array(base_array) else: + import numba.cuda + array = numba.cuda.to_device(base_array) return array +@pytest.mark.no_numba def test_reverse_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -141,6 +146,7 @@ def test_reverse_input_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba def test_reverse_output_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -155,6 +161,7 @@ def test_reverse_output_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba @pytest.mark.parametrize( "shape, itemsize, expected", [ @@ -179,6 +186,7 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected): assert result == expected +@pytest.mark.no_numba @pytest.mark.parametrize( "shape, dtype", [ diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py index 33d7d15de58..db97d5aca01 100644 --- a/python/cuda_cccl/tests/compute/test_merge_sort.py +++ b/python/cuda_cccl/tests/compute/test_merge_sort.py @@ -5,7 +5,6 @@ from typing import List import cupy as cp -import numba.cuda import numpy as np import pytest @@ -82,11 +81,11 @@ def compare_op(lhs, rhs): def test_merge_sort_keys(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_keys = cp.asarray(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items) - h_out_keys = d_in_keys.copy_to_host() + h_out_keys = d_in_keys.get() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -102,13 +101,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch): h_in_keys = random_array(num_items, dtype) h_in_items = random_array(num_items, np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items) - h_out_keys = d_in_keys.copy_to_host() - h_out_items = d_in_items.copy_to_host() + h_out_keys = d_in_keys.get() + h_out_items = d_in_items.get() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -123,12 +122,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items) - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -146,15 +145,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items) - h_out_keys = d_out_keys.copy_to_host() - h_out_items = d_out_items.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_items = d_out_items.get() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -225,11 +224,11 @@ def compare_complex(lhs, rhs): imaginary = random_array(num_items, np.int64, max_value) h_in_keys = real + 1j * imaginary - d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_keys = cp.asarray(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items) - h_out_keys = d_in_keys.copy_to_host() + h_out_keys = d_in_keys.get() h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)] np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -240,15 +239,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream") merge_sort_device(i_input, None, d_out_keys, None, op, num_items) h_in_keys.sort() - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -265,10 +264,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -277,8 +276,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items ) - h_out_keys = d_out_keys.copy_to_host() - h_out_items = d_out_items.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_items = d_out_items.get() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -307,6 +306,7 @@ def test_merge_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) +@pytest.mark.no_numba def test_merge_sort_well_known_less(): dtype = np.int32 @@ -326,6 +326,7 @@ def test_merge_sort_well_known_less(): np.testing.assert_equal(d_out_keys.get(), expected) +@pytest.mark.no_numba def test_merge_sort_well_known_greater(): dtype = np.int32 diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py index 9cbfa40e393..715873af3c8 100644 --- a/python/cuda_cccl/tests/compute/test_nested_struct.py +++ b/python/cuda_cccl/tests/compute/test_nested_struct.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np +import pytest import cuda.compute from cuda.compute import ZipIterator, gpu_struct @@ -149,6 +150,7 @@ def sum_pixels(p1, p2): assert result["color"]["b"] == expected_b +@pytest.mark.no_numba def test_dict_init_nested_struct(): """Test initializing a nested struct with a dictionary.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -162,6 +164,7 @@ def test_dict_init_nested_struct(): assert np.isclose(obj.inner.b, 3.14) +@pytest.mark.no_numba def test_dict_init_per_field(): """Test initializing a struct with a dictionary for a nested field.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -175,6 +178,7 @@ def test_dict_init_per_field(): assert np.isclose(obj.inner.b, 3.14) +@pytest.mark.no_numba def test_dict_init_deeply_nested(): """Test initializing deeply nested structs (3+ levels) with dictionaries.""" Level1 = gpu_struct({"value": np.int32}) @@ -189,6 +193,7 @@ def test_dict_init_deeply_nested(): assert obj.middle.nested.value == 42 +@pytest.mark.no_numba def test_dict_init_mixed(): """Test mixed initialization with some dicts and some direct values.""" Inner1 = gpu_struct({"a": np.int32, "b": np.int32}) diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py index 8f1d271e1ed..bbab965c8bf 100644 --- a/python/cuda_cccl/tests/compute/test_no_numba.py +++ b/python/cuda_cccl/tests/compute/test_no_numba.py @@ -1,12 +1,6 @@ -import cupy as cp -import numpy as np import pytest -import cuda.compute -from cuda.compute import OpKind - -# Mainly, these tests check that we can use algorithms with OpKind -# operators while not requiring numba to be installed. +# Check that tests marked no_numba fail fast if they import numba. pytestmark = pytest.mark.no_numba @@ -16,70 +10,3 @@ def test_import_numba_raises(): ImportError, match="This test is marked 'no_numba' but attempted to import it" ): import numba.cuda # noqa: F401 - - -def test_reduce_op_kind(): - num_items = 100 - h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) - - h_init = np.array(0, dtype=np.int32) - cuda.compute.reduce_into( - d_in=d_input, d_out=d_output, num_items=num_items, op=OpKind.PLUS, h_init=h_init - ) - - result = d_output.get()[0] - expected = np.sum(h_input) - assert result == expected - - -def test_binary_transform_op_kind(): - num_items = 100 - h_input1 = np.arange(num_items, dtype=np.int32) - h_input2 = np.arange(num_items, dtype=np.int32) * 2 - d_input1 = cp.array(h_input1) - d_input2 = cp.array(h_input2) - d_output = cp.empty(num_items, dtype=np.int32) - - cuda.compute.binary_transform( - d_in1=d_input1, - d_in2=d_input2, - d_out=d_output, - op=OpKind.PLUS, - num_items=num_items, - ) - - result = d_output.get() - expected = h_input1 + h_input2 - assert np.array_equal(result, expected) - - -def test_segmented_sort_op_kind(): - # Create segments: [3, 1, 4] | [1, 5, 9, 2] | [6, 5] - num_items = 9 - h_keys = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5], dtype=np.int32) - h_offsets = np.array([0, 3, 7, 9], dtype=np.int32) - - d_keys_in = cp.array(h_keys) - d_keys_out = cp.empty(num_items, dtype=np.int32) - d_offsets = cp.array(h_offsets) - - num_segments = len(h_offsets) - 1 - - cuda.compute.segmented_sort( - d_in_keys=d_keys_in, - d_out_keys=d_keys_out, - d_in_values=None, - d_out_values=None, - num_items=num_items, - num_segments=num_segments, - start_offsets_in=d_offsets[:-1], - end_offsets_in=d_offsets[1:], - order=cuda.compute.SortOrder.ASCENDING, - ) - - result = d_keys_out.get() - # Expected: [1, 3, 4] | [1, 2, 5, 9] | [5, 6] - expected = np.array([1, 3, 4, 1, 2, 5, 9, 5, 6], dtype=np.int32) - assert np.array_equal(result, expected) diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py index 74d74a7b4e6..226904b75da 100644 --- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py +++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np +import pytest import cuda.compute from cuda.compute.iterators import ( @@ -12,6 +13,7 @@ ) +@pytest.mark.no_numba def test_permutation_iterator_equality(): values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") values2 = cp.asarray([100, 200, 300], dtype="int32") @@ -37,6 +39,7 @@ def test_permutation_iterator_equality(): assert it1.kind != it5.kind +@pytest.mark.no_numba def test_permutation_iterator_with_array_values(): values = cp.asarray([10, 20, 30, 40, 50], dtype="int32") indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -54,6 +57,7 @@ def test_permutation_iterator_with_array_values(): assert d_output[0] == values[indices].sum() +@pytest.mark.no_numba def test_permutation_iterator_with_iterator_values(): values_it = CountingIterator(np.int32(10)) indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -157,6 +161,7 @@ def op(a): assert cp.all(d_out == expected) +@pytest.mark.no_numba def test_caching_permutation_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -204,6 +209,7 @@ def test_caching_permutation_iterator(): ) +@pytest.mark.no_numba def test_permutation_iterator_advance(): """Test PermutationIterator.__add__ only advances indices, not values.""" # Create values array [10, 20, 30, 40, 50, 60, 70] diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index 62b8e7dcb44..67186d3a4f7 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -6,11 +6,11 @@ from typing import Tuple import cupy as cp -import numba import numpy as np import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( DoubleBuffer, SortOrder, @@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu DTYPE_SIZE, ) def test_radix_sort_keys(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items) - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) radix_sort_device( d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items ) - h_out_keys = d_out_keys.copy_to_host() - h_out_values = d_out_values.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_values = d_out_values.get() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) radix_sort_device(keys_double_buffer, None, None, None, order, num_items) - h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_keys = keys_double_buffer.current().get() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): keys_double_buffer, None, values_double_buffer, None, order, num_items ) - h_out_keys = keys_double_buffer.current().copy_to_host() - h_out_values = values_double_buffer.current().copy_to_host() + h_out_keys = keys_double_buffer.current().get() + h_out_values = values_double_buffer.current().get() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE_BIT_WINDOW, ) def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) radix_sort_device( d_in_keys, @@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): end_bit, ) - h_out_keys = d_out_keys.copy_to_host() - h_out_values = d_out_values.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_values = d_out_values.get() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch end_bit, ) - h_out_keys = keys_double_buffer.current().copy_to_host() - h_out_values = values_double_buffer.current().copy_to_host() + h_out_keys = keys_double_buffer.current().get() + h_out_values = values_double_buffer.current().get() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -469,8 +469,9 @@ def test_radix_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) +@pytest.mark.no_numba def test_radix_sort(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -518,8 +519,9 @@ def test_radix_sort(monkeypatch): np.testing.assert_array_equal(h_out_items, h_in_values) +@pytest.mark.no_numba def test_radix_sort_double_buffer(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py index df3f024f0ee..15f11428506 100644 --- a/python/cuda_cccl/tests/compute/test_reduce.py +++ b/python/cuda_cccl/tests/compute/test_reduce.py @@ -6,7 +6,6 @@ import random import cupy as cp -import numba.cuda import numpy as np import pytest from cupy.cuda import runtime @@ -73,15 +72,15 @@ def add_op(a, b): def test_device_reduce(dtype, num_items, op): init_value = 42 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == pytest.approx( sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0 ) # obtained relative error value from c2h/include/c2h/check_results.cuh @@ -94,10 +93,10 @@ def test_device_reduce_with_lambda(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) # Use a lambda function directly as the reducer cuda.compute.reduce_into( @@ -107,7 +106,7 @@ def test_device_reduce_with_lambda(): op=lambda a, b: a + b, h_init=h_init, ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == sum(h_input) + init_value @@ -118,10 +117,10 @@ def test_device_reduce_with_lambda_variable(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) # Use a lambda function assigned to a variable as the reducer cuda.compute.reduce_into( @@ -131,24 +130,24 @@ def test_device_reduce_with_lambda_variable(): op=add_op_lambda, h_init=h_init, ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == sum(h_input) + init_value def test_complex_device_reduce(): h_init = np.array([40.0 + 2.0j], dtype=complex) - d_output = numba.cuda.device_array(1, dtype=complex) + d_output = cp.empty(1, dtype=complex) for num_items in [42, 420000]: real_imag = np.random.random((2, num_items)) h_input = real_imag[0] + 1j * real_imag[1] - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) assert d_input.size == num_items cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init ) - result = d_output.copy_to_host()[0] + result = d_output.get()[0] expected = np.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) @@ -162,11 +161,11 @@ def _test_device_sum_with_iterator( if use_numpy_array: h_input = np.array(l_varr, dtype_inp) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) else: d_input = i_input - d_output = numba.cuda.device_array(1, dtype_out) # to store device sum + d_output = cp.empty(1, dtype_out) # to store device sum h_init = np.array([start_sum_with], dtype_out) @@ -174,7 +173,7 @@ def _test_device_sum_with_iterator( d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == expected_result @@ -216,7 +215,7 @@ def test_device_sum_cache_modified_input_it( l_varr = [rng.randrange(100) for _ in range(num_items)] dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) + input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp)) i_input = CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -691,6 +690,7 @@ def add_op(x, y): ) +@pytest.mark.no_numba def test_device_reduce_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -709,6 +709,7 @@ def test_device_reduce_well_known_plus(): assert (d_output == expected_output).all() +@pytest.mark.no_numba def test_device_reduce_well_known_minimum(): dtype = np.int32 h_init = np.array([100], dtype=dtype) @@ -727,6 +728,7 @@ def test_device_reduce_well_known_minimum(): assert (d_output == expected_output).all() +@pytest.mark.no_numba def test_device_reduce_well_known_maximum(): dtype = np.int32 h_init = np.array([-100], dtype=dtype) @@ -925,6 +927,7 @@ def sqrt(x: dtype) -> dtype: np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) +@pytest.mark.no_numba def test_reduce_with_not_guaranteed_determinism(floating_array): dtype = floating_array.dtype h_init = np.array([0], dtype=dtype) @@ -942,6 +945,7 @@ def test_reduce_with_not_guaranteed_determinism(floating_array): ) +@pytest.mark.no_numba def test_reduce_bool(): h_init = np.array([False]) d_input = cp.array([True, False, True]) diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index af6d941b11b..97f5fae630a 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -4,11 +4,11 @@ import cupy as cp -import numba.cuda import numpy as np import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( CountingIterator, OpKind, @@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea [True, False], ) def test_scan_array_input(force_inclusive, input_array, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification if input is complex # as LDL/STL instructions are emitted for complex types. # Also skip for: @@ -214,6 +214,7 @@ def op(a, b): np.testing.assert_allclose(expected, got, rtol=1e-5) +@pytest.mark.no_numba def test_exclusive_scan_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -232,8 +233,9 @@ def test_exclusive_scan_well_known_plus(): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_inclusive_scan_well_known_plus(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -354,7 +356,7 @@ def add_op(a, b): def test_reverse_input_iterator(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -409,6 +411,7 @@ def add_op(a, b): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba @pytest.mark.parametrize( "force_inclusive", [True, False], @@ -430,13 +433,14 @@ def test_future_init_value(force_inclusive): np.testing.assert_array_equal(expected, got) +@pytest.mark.no_numba def test_no_init_value(monkeypatch): force_inclusive = True num_items = 1024 dtype = np.dtype("int32") # Skip SASS check for CC 9.0 due to LDL/STL CI failure. - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability if cc_major >= 9: import cuda.compute._cccl_interop @@ -456,6 +460,7 @@ def test_no_init_value(monkeypatch): np.testing.assert_array_equal(expected, got) +@pytest.mark.no_numba def test_no_init_value_iterator(): force_inclusive = True num_items = 1024 @@ -493,6 +498,7 @@ def test_inclusive_scan_with_lambda(): np.testing.assert_array_equal(d_output.get(), expected) +@pytest.mark.no_numba @pytest.mark.parametrize("force_inclusive", [True, False]) def test_scan_bool_maximum(force_inclusive): h_init = np.array([False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py index 950d860a14d..815c98b8edb 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py +++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py @@ -269,6 +269,7 @@ def _plus(a, b): ) +@pytest.mark.no_numba def test_segmented_reduce_well_known_plus(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -299,6 +300,7 @@ def test_segmented_reduce_well_known_plus(monkeypatch): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_segmented_reduce_well_known_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -329,6 +331,7 @@ def test_segmented_reduce_well_known_maximum(monkeypatch): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_segmented_reduce_bool_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py index 2bdd03abb42..9c2ab9f2de4 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_sort.py +++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py @@ -5,12 +5,13 @@ from typing import Tuple import cupy as cp -import numba import numpy as np import pytest import cuda.compute +pytestmark = pytest.mark.no_numba + DTYPE_LIST = [ np.uint8, np.int16, @@ -118,8 +119,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=50) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(np.empty_like(h_in_keys)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -133,7 +134,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): order=order, ) - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -153,10 +154,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = cp.asarray(h_in_keys) + d_in_vals = cp.asarray(h_in_vals) + d_out_keys = cp.asarray(np.empty_like(h_in_keys)) + d_out_vals = cp.asarray(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -170,8 +171,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): order=order, ) - h_out_keys = d_out_keys.copy_to_host() - h_out_vals = d_out_vals.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_vals = d_out_vals.get() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -189,8 +190,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): h_in_keys = random_array(num_items, dtype, max_value=20) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_in_keys = cp.asarray(h_in_keys) + d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) cuda.compute.segmented_sort( @@ -205,7 +206,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().copy_to_host() + h_out_keys = keys_db.current().get() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -224,10 +225,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = cp.asarray(h_in_keys) + d_in_vals = cp.asarray(h_in_vals) + d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) + d_tmp_vals = cp.asarray(np.empty_like(h_in_vals)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals) @@ -244,8 +245,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().copy_to_host() - h_out_vals = vals_db.current().copy_to_host() + h_out_keys = keys_db.current().get() + h_out_vals = vals_db.current().get() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -297,10 +298,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments): h_in_keys = random_array(num_items, np.int32, max_value=100) h_in_vals = random_array(num_items, np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = cp.asarray(h_in_keys) + d_in_vals = cp.asarray(h_in_vals) + d_out_keys = cp.asarray(np.empty_like(h_in_keys)) + d_out_vals = cp.asarray(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -314,8 +315,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments): order=order, ) - h_out_keys = d_out_keys.copy_to_host() - h_out_vals = d_out_vals.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_vals = d_out_vals.get() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order ) diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index f2483860a4a..d467d33c635 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -7,6 +7,8 @@ import pytest import cuda.compute +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct DTYPE_LIST = [ @@ -59,6 +61,35 @@ def _host_select(h_in: np.ndarray, cond): return selected, np.int64(selected.size) +def _raw_even_i32_op() -> RawOp: + source = """ +extern "C" __device__ void is_even_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = (value % 2) == 0; +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32") + + +@pytest.mark.no_numba +def test_select_raw_op_minimal(): + h_in = np.arange(10, dtype=np.int32) + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.select( + d_in=d_in, + d_out=d_out, + d_num_selected_out=d_num_selected, + cond=_raw_even_i32_op(), + num_items=len(d_in), + ) + + num_selected = int(d_num_selected[0].get()) + np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0]) + + @pytest.mark.parametrize("dtype,num_items", select_params) def test_select_basic(dtype, num_items): h_in = random_array(num_items, dtype, max_value=100) diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py index 5c82fe1ad38..715f9e51235 100644 --- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py +++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py @@ -87,6 +87,7 @@ def test_shuffle_iterator_with_permutation_iterator(): assert sorted(result) == sorted(d_values.get()) +@pytest.mark.no_numba def test_shuffle_iterator_invalid_num_items(): with pytest.raises(ValueError, match="num_items must be > 0"): ShuffleIterator(0, seed=42) diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py index 3ddaee54d41..cf1edd7e681 100644 --- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py +++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py @@ -14,6 +14,8 @@ from cuda.compute import gpu_struct +pytestmark = pytest.mark.no_numba + def test_newline_in_field_name_is_rejected(): """Field names with newlines must be rejected — they are the exec() injection vector.""" diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 3fb5b102275..9f2683f6798 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -7,6 +7,8 @@ import pytest import cuda.compute +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp from cuda.compute import CacheModifiedInputIterator, gpu_struct DTYPE_LIST = [ @@ -65,6 +67,50 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op): ) +def _raw_less_than_i32(name: str, threshold: int) -> RawOp: + source = f""" +extern "C" __device__ void {name}(void* x, void* result) {{ + int value = *static_cast(x); + *static_cast(result) = value < {threshold}; +}} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name=name) + + +@pytest.mark.no_numba +def test_three_way_partition_raw_op_minimal(): + h_in = np.arange(10, dtype=np.int32) + d_in = cp.asarray(h_in) + d_first = cp.empty_like(d_in) + d_second = cp.empty_like(d_in) + d_unselected = cp.empty_like(d_in) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.three_way_partition( + d_in=d_in, + d_first_part_out=d_first, + d_second_part_out=d_second, + d_unselected_out=d_unselected, + d_num_selected_out=d_num_selected, + select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3), + select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6), + num_items=len(d_in), + ) + + selected = d_num_selected.get() + first_count = int(selected[0]) + second_count = int(selected[1]) + unselected_count = len(h_in) - first_count - second_count + + np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3]) + np.testing.assert_array_equal( + d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)] + ) + np.testing.assert_array_equal( + d_unselected.get()[:unselected_count], h_in[h_in >= 6] + ) + + @pytest.mark.parametrize("dtype,num_items", three_way_partition_params) def test_three_way_partition_basic(dtype, num_items, monkeypatch): # NOTE: the SASS check failure is seen only with NVRTC 13.1: diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py index c7c3ca2818a..b8429726d77 100644 --- a/python/cuda_cccl/tests/compute/test_transform.py +++ b/python/cuda_cccl/tests/compute/test_transform.py @@ -266,6 +266,7 @@ def op2(a): np.testing.assert_allclose(expected, got) +@pytest.mark.no_numba def test_unary_transform_well_known_negate(): """Test unary transform with well-known NEGATE operation.""" dtype = np.int32 @@ -282,6 +283,7 @@ def test_unary_transform_well_known_negate(): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_unary_transform_well_known_identity(): """Test unary transform with well-known IDENTITY operation.""" dtype = np.int32 @@ -298,6 +300,7 @@ def test_unary_transform_well_known_identity(): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba @pytest.mark.parametrize("dtype", [np.int32, np.float16]) def test_binary_transform_well_known_plus(dtype): """Test binary transform with well-known PLUS operation.""" @@ -319,6 +322,7 @@ def test_binary_transform_well_known_plus(dtype): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_binary_transform_well_known_multiplies(): """Test binary transform with well-known MULTIPLIES operation.""" dtype = np.int32 @@ -586,6 +590,7 @@ def test_binary_transform_with_lambda(): np.testing.assert_array_equal(d_out.get(), expected) +@pytest.mark.no_numba def test_binary_transform_bool_equal_to(): d_input1 = cp.array([True, False, True, False], dtype=np.bool_) d_input2 = cp.array([True, True, False, False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 83a4a17db67..5f7d310d71e 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -4,11 +4,11 @@ import cupy as cp -import numba.cuda import numpy as np import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( CacheModifiedInputIterator, DiscardIterator, @@ -125,7 +125,7 @@ def compare_op(lhs, rhs): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key(dtype, num_items, op, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) + d_out_num_selected = cp.asarray(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] - h_out_items = d_out_items.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] + h_out_items = d_out_items.get()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int64) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) + d_out_num_selected = cp.asarray(h_out_num_selected) i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] - h_out_items = d_out_items.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] + h_out_items = d_out_items.get()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -226,9 +226,9 @@ def test_unique_by_key_keys_only(): h_out_keys = np.empty(num_items, dtype=np.int32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) + d_out_num_selected = cp.asarray(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -240,9 +240,9 @@ def test_unique_by_key_keys_only(): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] expected_keys, _ = unique_by_key_host( h_in_keys, @@ -267,11 +267,11 @@ def compare_complex(lhs, rhs): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) + d_out_num_selected = cp.asarray(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -283,10 +283,10 @@ def compare_complex(lhs, rhs): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] - h_out_items = d_out_items.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] + h_out_items = d_out_items.get()[:num_selected] expected_keys, expected_items = unique_by_key_host( h_in_keys, h_in_items, compare_complex @@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs): def test_unique_by_key_with_stream(cuda_stream, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -417,8 +417,9 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): np.testing.assert_array_equal(h_out_items, expected_items) +@pytest.mark.no_numba def test_unique_by_key_well_known_equal_to(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py index e08709b6e1a..e4d8185eb91 100644 --- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py +++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py @@ -12,7 +12,9 @@ """ import pytest -from numba import types + +numba = pytest.importorskip("numba") +types = numba.types from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper from cuda.compute._utils import sanitize_identifier diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index 85630bb4f5f..aea1f167f9f 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -6,6 +6,7 @@ import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( CountingIterator, TransformIterator, @@ -261,10 +262,8 @@ def min_pairs(p1, p2): @pytest.mark.parametrize("num_items", [10, 1000]) def test_output_zip_iterator_with_scan(monkeypatch, num_items): """Test ZipIterator as output iterator with scan operations.""" - import numba.cuda - # Skip SASS check for CC 8.0+ due to LDL/STL CI failure. - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -426,9 +425,7 @@ def sum_nested_zips(v1, v2): ], ) def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map): - import numba.cuda - - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -502,6 +499,7 @@ def g(x): assert it1.kind != it2.kind +@pytest.mark.no_numba def test_caching_zip_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -577,6 +575,7 @@ def test_caching_zip_iterator(): assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind" +@pytest.mark.no_numba def test_compilation_caching_across_iterator_types(): """Test that compilation caching works across different iterator types.""" from cuda.compute import ConstantIterator From d97d809f8eaf9b8fc60fe9ebb337ef52284526c8 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 12:48:11 -0500 Subject: [PATCH 2/7] Stop wrapping binary search comparator in python callable --- .../cuda/compute/algorithms/_binary_search.py | 21 +------ .../tests/compute/test_binary_search.py | 60 +++++++++++++++++++ 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py index 6cadd994ceb..23a99c7bc40 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py @@ -16,19 +16,6 @@ from ..typing import DeviceArrayLike, IteratorT, Operator -def _normalize_comp(comp: Operator | None) -> OpAdapter: - # Use a lambda for the default comparator rather than OpKind.LESS - # because well-known ops don't carry type information needed by - # the binary search JIT compilation. - if comp is None or comp is OpKind.LESS: - - def _default_less(a, b): - return a < b - - return make_op_adapter(_default_less) - return make_op_adapter(comp) - - class _BinarySearch: __slots__ = [ "build_result", @@ -96,9 +83,7 @@ def __call__( set_cccl_iterator_state(self.d_out_cccl, d_out) # Update op state for stateful ops - comp_adapter = ( - _normalize_comp(comp) if comp is not None else _normalize_comp(None) - ) + comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp) self.op_cccl.state = comp_adapter.get_state() stream_handle = protocols.validate_and_get_stream(stream) @@ -154,7 +139,7 @@ def make_lower_bound( See Also: :func:`lower_bound` """ - comp_adapter = _normalize_comp(comp) + comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp) return _make_binary_search( d_data, d_values, @@ -193,7 +178,7 @@ def make_upper_bound( See Also: :func:`upper_bound` """ - comp_adapter = _normalize_comp(comp) + comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp) return _make_binary_search( d_data, d_values, diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index a4f7d047a9b..585a998c44c 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -6,6 +6,7 @@ import pytest import cuda.compute +from cuda.compute import OpKind DTYPE_LIST = [ np.int32, @@ -39,6 +40,65 @@ def disable_sass_check(monkeypatch): ) +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_explicit_opkind_less(search, side): + h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) + h_values = np.array([0, 3, 4, 10], dtype=np.int32) + + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = cp.empty(len(h_values), dtype=np.uintp) + + search( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_out, + comp=OpKind.LESS, + ) + + expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_custom_comparator(search, side): + h_data = np.array([9, 7, 3, 3, 1], dtype=np.int32) + h_values = np.array([10, 4, 3, 0], dtype=np.int32) + + def greater(lhs, rhs): + return lhs > rhs + + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = cp.empty(len(h_values), dtype=np.uintp) + + search( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_out, + comp=greater, + ) + + expected = np.searchsorted(-h_data, -h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + @pytest.mark.parametrize("dtype", DTYPE_LIST) @pytest.mark.parametrize( "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)] From c564d3e13e9931f25052099ec0f461ccee824e08 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 17:04:23 -0500 Subject: [PATCH 3/7] Address comments --- ci/test_cuda_compute_minimal_python.sh | 4 ++-- python/cuda_cccl/tests/compute/test_binary_search.py | 1 + python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh index c48b313ed31..672338bbf33 100755 --- a/ci/test_cuda_compute_minimal_python.sh +++ b/ci/test_cuda_compute_minimal_python.sh @@ -19,8 +19,8 @@ setup_python_env "${py_version}" # Fetch or build the cuda_cccl wheel: if [[ -n "${GITHUB_ACTIONS:-}" ]]; then wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh") - "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/ - wheelhouse_dir="/home/coder/cccl/wheelhouse" + "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/" + wheelhouse_dir="${repo_root}/wheelhouse" else "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}" wheelhouse_dir="${repo_root}/wheelhouse" diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 585a998c44c..1e24ca00116 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -47,6 +47,7 @@ def disable_sass_check(monkeypatch): (cuda.compute.upper_bound, "right"), ], ) +@pytest.mark.no_numba def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 9f2683f6798..b9813558de3 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -71,7 +71,7 @@ def _raw_less_than_i32(name: str, threshold: int) -> RawOp: source = f""" extern "C" __device__ void {name}(void* x, void* result) {{ int value = *static_cast(x); - *static_cast(result) = value < {threshold}; + *static_cast(result) = value < {threshold} ? 1 : 0; }} """ return RawOp(ltoir=compile_cpp_op_code(source), name=name) From 79df8e0a62cce4a6b943a369406da3987c989aa8 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 17:11:35 -0500 Subject: [PATCH 4/7] fix merge conflict --- python/cuda_cccl/tests/compute/test_binary_search.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 9b07eeb8815..1e24ca00116 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -47,10 +47,7 @@ def disable_sass_check(monkeypatch): (cuda.compute.upper_bound, "right"), ], ) -<<<<<<< HEAD @pytest.mark.no_numba -======= ->>>>>>> main def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) From c2e7c01b5732f968520ef25c21503f244fb02a6b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:16:44 +0000 Subject: [PATCH 5/7] [pre-commit.ci] auto code formatting --- python/cuda_cccl/tests/compute/test_radix_sort.py | 2 +- python/cuda_cccl/tests/compute/test_scan.py | 2 +- python/cuda_cccl/tests/compute/test_select.py | 2 +- python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +- python/cuda_cccl/tests/compute/test_unique_by_key.py | 2 +- python/cuda_cccl/tests/compute/test_zip_iterator.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index 67186d3a4f7..d39a7e07fad 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -10,11 +10,11 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( DoubleBuffer, SortOrder, ) +from cuda.core import Device def get_mark(dt, log_size): diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index 97f5fae630a..0eb3af34c49 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -8,7 +8,6 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( CountingIterator, OpKind, @@ -16,6 +15,7 @@ TransformOutputIterator, gpu_struct, ) +from cuda.core import Device def scan_host(h_input: np.ndarray, op, h_init, force_inclusive): diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index d467d33c635..3dcaf98a507 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -7,9 +7,9 @@ import pytest import cuda.compute +from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct from cuda.compute._cpp_compile import compile_cpp_op_code from cuda.compute.op import RawOp -from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct DTYPE_LIST = [ np.uint8, diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index b9813558de3..8def513681b 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -7,9 +7,9 @@ import pytest import cuda.compute +from cuda.compute import CacheModifiedInputIterator, gpu_struct from cuda.compute._cpp_compile import compile_cpp_op_code from cuda.compute.op import RawOp -from cuda.compute import CacheModifiedInputIterator, gpu_struct DTYPE_LIST = [ np.uint8, diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 5f7d310d71e..945d4d99028 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -8,13 +8,13 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( CacheModifiedInputIterator, DiscardIterator, OpKind, gpu_struct, ) +from cuda.core import Device DTYPE_LIST = [ np.uint8, diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index aea1f167f9f..9fbed38359f 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -6,13 +6,13 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( CountingIterator, TransformIterator, ZipIterator, gpu_struct, ) +from cuda.core import Device @pytest.mark.parametrize("num_items", [10, 1_000, 100_000]) From b09c9c6ecb2df85d4e4fed2279fcfd7efe255254 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 17:26:21 -0500 Subject: [PATCH 6/7] Fix pre-commit --- .../tests/compute/test_void_ptr_wrapper_validation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py index e4d8185eb91..2788a3c0893 100644 --- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py +++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py @@ -16,8 +16,12 @@ numba = pytest.importorskip("numba") types = numba.types -from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper -from cuda.compute._utils import sanitize_identifier +from cuda.compute._odr_helpers import ( # noqa: E402 + _ArgMode, + _ArgSpec, + _create_void_ptr_wrapper, +) +from cuda.compute._utils import sanitize_identifier # noqa: E402 def _make_arg_specs(): From 15652c00259b5650977a4fb33d66d4bd680559b6 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 23 Jun 2026 09:34:33 -0500 Subject: [PATCH 7/7] Move no numba tests to separate file --- ci/test_cuda_compute_minimal_python.sh | 21 +- .../tests/compute/test_binary_search.py | 1 - .../cuda_cccl/tests/compute/test_bindings.py | 2 - .../compute/test_deferred_annotations.py | 2 - .../tests/compute/test_func_caching.py | 9 - .../cuda_cccl/tests/compute/test_histogram.py | 2 - .../cuda_cccl/tests/compute/test_iterators.py | 10 +- .../tests/compute/test_merge_sort.py | 55 +- .../tests/compute/test_nested_struct.py | 5 - .../cuda_cccl/tests/compute/test_no_numba.py | 560 +++++++++++++++++- .../compute/test_permutation_iterator.py | 6 - .../tests/compute/test_radix_sort.py | 76 ++- python/cuda_cccl/tests/compute/test_reduce.py | 38 +- python/cuda_cccl/tests/compute/test_scan.py | 16 +- .../tests/compute/test_segmented_reduce.py | 3 - .../tests/compute/test_segmented_sort.py | 51 +- python/cuda_cccl/tests/compute/test_select.py | 31 - .../tests/compute/test_shuffle_iterator.py | 1 - .../compute/test_struct_field_validation.py | 2 - .../tests/compute/test_three_way_partition.py | 46 -- .../cuda_cccl/tests/compute/test_transform.py | 5 - .../tests/compute/test_unique_by_key.py | 69 ++- .../test_void_ptr_wrapper_validation.py | 12 +- .../tests/compute/test_zip_iterator.py | 11 +- 24 files changed, 716 insertions(+), 318 deletions(-) diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh index 672338bbf33..a43bbac7bfa 100755 --- a/ci/test_cuda_compute_minimal_python.sh +++ b/ci/test_cuda_compute_minimal_python.sh @@ -27,25 +27,10 @@ else fi # Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the -# full cu* extras because those pull in numba/numba-cuda. In a clean minimal -# environment, the test phase below runs only tests marked no_numba. +# full cu* extras because those pull in numba/numba-cuda. CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)" python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]" python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x" -if python - <<'PY' -try: - import numba.cuda # noqa: F401 -except Exception as exc: - print(f"numba.cuda unavailable; running no_numba subset: {exc!r}") - raise SystemExit(1) -else: - print("numba.cuda available; running full compute test suite.") -PY -then - cd "${repo_root}/python/cuda_cccl/tests/" - python -m pytest -n 6 -v compute/ -m "not large" -else - cd "${repo_root}/python/cuda_cccl/tests/" - python -m pytest -n 6 -v compute/ -m "not large and no_numba" -fi +cd "${repo_root}/python/cuda_cccl/tests/" +python -m pytest -n 6 -v compute/test_no_numba.py diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 1e24ca00116..585a998c44c 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -47,7 +47,6 @@ def disable_sass_check(monkeypatch): (cuda.compute.upper_bound, "right"), ], ) -@pytest.mark.no_numba def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py index ccfbd8d0bc3..2f16da620ef 100644 --- a/python/cuda_cccl/tests/compute/test_bindings.py +++ b/python/cuda_cccl/tests/compute/test_bindings.py @@ -4,8 +4,6 @@ import cuda.compute._bindings as bindings -pytestmark = pytest.mark.no_numba - @pytest.fixture( params=[ diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py index a584b8ffbe2..c30aeda4068 100644 --- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py +++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py @@ -7,12 +7,10 @@ import cupy as cp import numpy as np -import pytest from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into -@pytest.mark.no_numba def test_deferred_annotations(): # test that we can use @gpu_struct with deferred annotations # GH: #6421 diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py index e94e0069529..8fcf47b32a7 100644 --- a/python/cuda_cccl/tests/compute/test_func_caching.py +++ b/python/cuda_cccl/tests/compute/test_func_caching.py @@ -1,12 +1,10 @@ import numpy as np -import pytest from cuda.compute._caching import CachableFunction global_x = 1 -@pytest.mark.no_numba def test_func_caching_basic(): def func(x): return x @@ -21,7 +19,6 @@ def func(x): assert f1 == f2 -@pytest.mark.no_numba def test_func_caching_different_names(): def func(x): return x @@ -36,7 +33,6 @@ def func2(x): assert f1 != f2 -@pytest.mark.no_numba def test_func_caching_different_code(): def func(x): return x @@ -50,7 +46,6 @@ def func(x): assert f1 != f2 -@pytest.mark.no_numba def test_func_caching_with_closure(): def factory(x): def func(y): @@ -66,7 +61,6 @@ def func(y): assert f1 != f3 -@pytest.mark.no_numba def test_func_caching_with_numpy_numeric_scalar_closure(): def factory(indexlength, regularsize): index_dtype = np.int64 @@ -86,7 +80,6 @@ def func(counter): assert f1 != f3 -@pytest.mark.no_numba def test_func_caching_with_global_variable(): global global_x @@ -133,7 +126,6 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func3) -@pytest.mark.no_numba def test_func_caching_with_global_np_ufunc(): def make_func(): def func(x): @@ -153,7 +145,6 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func2) -@pytest.mark.no_numba def test_func_caching_with_aliased_np_ufunc(): def make_func1(): amin = np.argmin diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py index 5c104114754..0a9d96674e6 100644 --- a/python/cuda_cccl/tests/compute/test_histogram.py +++ b/python/cuda_cccl/tests/compute/test_histogram.py @@ -12,8 +12,6 @@ CountingIterator, ) -pytestmark = pytest.mark.no_numba - DTYPE_LIST = [ np.uint8, np.uint16, diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py index c3b6acc9fc9..49e2ab39d15 100644 --- a/python/cuda_cccl/tests/compute/test_iterators.py +++ b/python/cuda_cccl/tests/compute/test_iterators.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp +import numba.cuda import numpy as np import pytest @@ -21,7 +22,6 @@ ) -@pytest.mark.no_numba def test_constant_iterator_equality(): it1 = ConstantIterator(np.int32(0)) it2 = ConstantIterator(np.int32(0)) @@ -32,7 +32,6 @@ def test_constant_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba def test_counting_iterator_equality(): it1 = CountingIterator(np.int32(0)) it2 = CountingIterator(np.int32(0)) @@ -43,7 +42,6 @@ def test_counting_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba def test_cache_modified_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -124,14 +122,11 @@ def reverse_iterator_array(request): if array_type == "cupy": array = cp.array(base_array) else: - import numba.cuda - array = numba.cuda.to_device(base_array) return array -@pytest.mark.no_numba def test_reverse_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -146,7 +141,6 @@ def test_reverse_input_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba def test_reverse_output_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -161,7 +155,6 @@ def test_reverse_output_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba @pytest.mark.parametrize( "shape, itemsize, expected", [ @@ -186,7 +179,6 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected): assert result == expected -@pytest.mark.no_numba @pytest.mark.parametrize( "shape, dtype", [ diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py index db97d5aca01..33d7d15de58 100644 --- a/python/cuda_cccl/tests/compute/test_merge_sort.py +++ b/python/cuda_cccl/tests/compute/test_merge_sort.py @@ -5,6 +5,7 @@ from typing import List import cupy as cp +import numba.cuda import numpy as np import pytest @@ -81,11 +82,11 @@ def compare_op(lhs, rhs): def test_merge_sort_keys(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) - d_in_keys = cp.asarray(h_in_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items) - h_out_keys = d_in_keys.get() + h_out_keys = d_in_keys.copy_to_host() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -101,13 +102,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch): h_in_keys = random_array(num_items, dtype) h_in_items = random_array(num_items, np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items) - h_out_keys = d_in_keys.get() - h_out_items = d_in_items.get() + h_out_keys = d_in_keys.copy_to_host() + h_out_items = d_in_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -122,12 +123,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -145,15 +146,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items) - h_out_keys = d_out_keys.get() - h_out_items = d_out_items.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_items = d_out_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -224,11 +225,11 @@ def compare_complex(lhs, rhs): imaginary = random_array(num_items, np.int64, max_value) h_in_keys = real + 1j * imaginary - d_in_keys = cp.asarray(h_in_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items) - h_out_keys = d_in_keys.get() + h_out_keys = d_in_keys.copy_to_host() h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)] np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -239,15 +240,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream") merge_sort_device(i_input, None, d_out_keys, None, op, num_items) h_in_keys.sort() - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -264,10 +265,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -276,8 +277,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items ) - h_out_keys = d_out_keys.get() - h_out_items = d_out_items.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_items = d_out_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -306,7 +307,6 @@ def test_merge_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) -@pytest.mark.no_numba def test_merge_sort_well_known_less(): dtype = np.int32 @@ -326,7 +326,6 @@ def test_merge_sort_well_known_less(): np.testing.assert_equal(d_out_keys.get(), expected) -@pytest.mark.no_numba def test_merge_sort_well_known_greater(): dtype = np.int32 diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py index 715873af3c8..9cbfa40e393 100644 --- a/python/cuda_cccl/tests/compute/test_nested_struct.py +++ b/python/cuda_cccl/tests/compute/test_nested_struct.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np -import pytest import cuda.compute from cuda.compute import ZipIterator, gpu_struct @@ -150,7 +149,6 @@ def sum_pixels(p1, p2): assert result["color"]["b"] == expected_b -@pytest.mark.no_numba def test_dict_init_nested_struct(): """Test initializing a nested struct with a dictionary.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -164,7 +162,6 @@ def test_dict_init_nested_struct(): assert np.isclose(obj.inner.b, 3.14) -@pytest.mark.no_numba def test_dict_init_per_field(): """Test initializing a struct with a dictionary for a nested field.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -178,7 +175,6 @@ def test_dict_init_per_field(): assert np.isclose(obj.inner.b, 3.14) -@pytest.mark.no_numba def test_dict_init_deeply_nested(): """Test initializing deeply nested structs (3+ levels) with dictionaries.""" Level1 = gpu_struct({"value": np.int32}) @@ -193,7 +189,6 @@ def test_dict_init_deeply_nested(): assert obj.middle.nested.value == 42 -@pytest.mark.no_numba def test_dict_init_mixed(): """Test mixed initialization with some dicts and some direct values.""" Inner1 = gpu_struct({"a": np.int32, "b": np.int32}) diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py index bbab965c8bf..6fb1ef0e811 100644 --- a/python/cuda_cccl/tests/compute/test_no_numba.py +++ b/python/cuda_cccl/tests/compute/test_no_numba.py @@ -1,12 +1,568 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import cupy as cp +import numpy as np import pytest -# Check that tests marked no_numba fail fast if they import numba. +import cuda.compute +from cuda.compute import ( + CacheModifiedInputIterator, + ConstantIterator, + CountingIterator, + DiscardIterator, + OpKind, + PermutationIterator, + ReverseIterator, + ShuffleIterator, + SortOrder, + TransformIterator, + TransformOutputIterator, + ZipIterator, +) +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp +from cuda.compute.types import int16 as cccl_int16 +from cuda.compute.types import int32 as cccl_int32 + +# These tests define the minimal-extra integration contract. They intentionally +# use small fixed inputs and avoid the Python-callable operator path. pytestmark = pytest.mark.no_numba -@pytest.mark.no_numba +def _raw_op(source: str, name: str) -> RawOp: + return RawOp(ltoir=compile_cpp_op_code(source), name=name) + + +def _raw_even_i32_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_even_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = (value % 2) == 0; +} +""" + return _raw_op(source, "no_numba_even_i32") + + +def _raw_less_than_i32_op(name: str, threshold: int) -> RawOp: + source = f""" +extern "C" __device__ void {name}(void* x, void* result) {{ + int value = *static_cast(x); + *static_cast(result) = value < {threshold} ? 1 : 0; +}} +""" + return _raw_op(source, name) + + +def _raw_plus_i64_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_plus_i64( + void* lhs, + void* rhs, + void* result +) { + *static_cast(result) = + *static_cast(lhs) + *static_cast(rhs); +} +""" + return _raw_op(source, "no_numba_plus_i64") + + +def _raw_square_i32_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_square_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = value * value; +} +""" + return _raw_op(source, "no_numba_square_i32") + + +def _raw_zip_sum_i32_op() -> RawOp: + source = """ +struct Zip2I32 { + int field_0; + int field_1; +}; + +extern "C" __device__ void no_numba_zip_sum_i32(void* x, void* result) { + auto values = static_cast(x); + *static_cast(result) = values->field_0 + values->field_1; +} +""" + return _raw_op(source, "no_numba_zip_sum_i32") + + +def _raw_negate_i16_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_negate_i16(void* x, void* result) { + *static_cast(result) = -*static_cast(x); +} +""" + return _raw_op(source, "no_numba_negate_i16") + + def test_import_numba_raises(): with pytest.raises( ImportError, match="This test is marked 'no_numba' but attempted to import it" ): import numba.cuda # noqa: F401 + + +def test_reduce_well_known_plus(): + h_input = np.arange(1, 14, dtype=np.int32) + d_input = cp.asarray(h_input) + d_output = cp.empty(1, dtype=np.int32) + h_init = np.array([5], dtype=np.int32) + + cuda.compute.reduce_into( + d_in=d_input, + d_out=d_output, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=h_init, + ) + + assert d_output.get()[0] == np.sum(h_input, initial=h_init[0]) + + +def test_exclusive_scan_well_known_plus(): + d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16) + d_output = cp.empty_like(d_input) + h_init = np.array([1], dtype=np.uint16) + + cuda.compute.exclusive_scan( + d_in=d_input, + d_out=d_output, + op=OpKind.PLUS, + init_value=h_init, + num_items=d_input.size, + ) + + expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16) + np.testing.assert_array_equal(d_output.get(), expected) + + +def test_binary_transform_well_known_plus(): + d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32) + d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.binary_transform( + d_in1=d_lhs, + d_in2=d_rhs, + d_out=d_output, + op=OpKind.PLUS, + num_items=d_lhs.size, + ) + + np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get()) + + +def test_unary_transform_well_known_negate(): + d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8) + d_output = cp.empty_like(d_input) + + cuda.compute.unary_transform( + d_in=d_input, + d_out=d_output, + op=OpKind.NEGATE, + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4])) + + +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_explicit_opkind_less(search, side): + h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64) + h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64) + d_out = cp.empty(h_values.size, dtype=np.uintp) + + search( + d_data=cp.asarray(h_data), + num_items=h_data.size, + d_values=cp.asarray(h_values), + num_values=h_values.size, + d_out=d_out, + comp=OpKind.LESS, + ) + + expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + +def test_segmented_reduce_well_known_plus(monkeypatch): + monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) + + d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32) + d_starts = cp.asarray([0, 3, 5], dtype=np.int32) + d_ends = cp.asarray([3, 5, 8], dtype=np.int32) + d_output = cp.empty(3, dtype=np.uint32) + h_init = np.array([0], dtype=np.uint32) + + cuda.compute.segmented_reduce( + d_in=d_input, + d_out=d_output, + num_segments=3, + start_offsets_in=d_starts, + end_offsets_in=d_ends, + op=OpKind.PLUS, + h_init=h_init, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21])) + + +def test_merge_sort_well_known_less(): + d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64) + d_output = cp.empty_like(d_input) + + cuda.compute.merge_sort( + d_in_keys=d_input, + d_in_values=None, + d_out_keys=d_output, + d_out_values=None, + num_items=d_input.size, + op=OpKind.LESS, + ) + + np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get())) + + +def test_radix_sort_key_value_pairs(): + h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16) + h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8) + d_out_keys = cp.empty_like(cp.asarray(h_keys)) + d_out_values = cp.empty_like(cp.asarray(h_values)) + + cuda.compute.radix_sort( + d_in_keys=cp.asarray(h_keys), + d_out_keys=d_out_keys, + d_in_values=cp.asarray(h_values), + d_out_values=d_out_values, + num_items=h_keys.size, + order=SortOrder.ASCENDING, + ) + + order = np.argsort(h_keys, stable=True) + np.testing.assert_array_equal(d_out_keys.get(), h_keys[order]) + np.testing.assert_array_equal(d_out_values.get(), h_values[order]) + + +def test_segmented_sort_keys(): + h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64) + h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64) + d_output = cp.empty_like(cp.asarray(h_keys)) + + cuda.compute.segmented_sort( + d_in_keys=cp.asarray(h_keys), + d_out_keys=d_output, + d_in_values=None, + d_out_values=None, + num_items=h_keys.size, + num_segments=h_offsets.size - 1, + start_offsets_in=cp.asarray(h_offsets[:-1]), + end_offsets_in=cp.asarray(h_offsets[1:]), + order=SortOrder.ASCENDING, + ) + + expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64) + np.testing.assert_array_equal(d_output.get(), expected) + + +def test_unique_by_key_well_known_equal_to(monkeypatch): + cc_major, _ = cuda.compute._cccl_interop.CudaDevice().compute_capability + if cc_major >= 9: + monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) + + d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16) + d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8) + d_out_keys = cp.empty_like(d_keys) + d_out_values = cp.empty_like(d_values) + d_num_selected = cp.empty(1, dtype=np.int64) + + cuda.compute.unique_by_key( + d_in_keys=d_keys, + d_in_items=d_values, + d_out_keys=d_out_keys, + d_out_items=d_out_values, + d_out_num_selected=d_num_selected, + op=OpKind.EQUAL_TO, + num_items=d_keys.size, + ) + + num_selected = int(d_num_selected.get()[0]) + np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4]) + np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40]) + + +def test_histogram_even_small_range(): + h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32) + d_histogram = cp.empty(4, dtype=np.int32) + + cuda.compute.histogram_even( + d_samples=cp.asarray(h_samples), + d_histogram=d_histogram, + num_output_levels=5, + lower_level=np.float32(0.0), + upper_level=np.float32(4.0), + num_samples=h_samples.size, + ) + + expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0)) + np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32)) + + +def test_select_raw_op(): + h_input = np.arange(12, dtype=np.int32) + d_output = cp.empty_like(cp.asarray(h_input)) + d_num_selected = cp.empty(1, dtype=np.uint64) + + cuda.compute.select( + d_in=cp.asarray(h_input), + d_out=d_output, + d_num_selected_out=d_num_selected, + cond=_raw_even_i32_op(), + num_items=h_input.size, + ) + + num_selected = int(d_num_selected.get()[0]) + np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2]) + + +def test_three_way_partition_raw_op(): + h_input = np.arange(12, dtype=np.int32) + d_first = cp.empty_like(cp.asarray(h_input)) + d_second = cp.empty_like(cp.asarray(h_input)) + d_unselected = cp.empty_like(cp.asarray(h_input)) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.three_way_partition( + d_in=cp.asarray(h_input), + d_first_part_out=d_first, + d_second_part_out=d_second, + d_unselected_out=d_unselected, + d_num_selected_out=d_num_selected, + select_first_part_op=_raw_less_than_i32_op("no_numba_less_than_4_i32", 4), + select_second_part_op=_raw_less_than_i32_op("no_numba_less_than_8_i32", 8), + num_items=h_input.size, + ) + + selected = d_num_selected.get() + first_count = int(selected[0]) + second_count = int(selected[1]) + unselected_count = h_input.size - first_count - second_count + + np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4]) + np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8]) + np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:]) + + +def test_raw_op_reduce(): + h_input = np.asarray([10, 20, 30, 40], dtype=np.int64) + d_output = cp.empty(1, dtype=np.int64) + + cuda.compute.reduce_into( + d_in=cp.asarray(h_input), + d_out=d_output, + num_items=h_input.size, + op=_raw_plus_i64_op(), + h_init=np.array([5], dtype=np.int64), + ) + + assert d_output.get()[0] == 105 + + +def test_stream_argument(cuda_stream): + d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32) + d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.binary_transform( + d_in1=d_lhs, + d_in2=d_rhs, + d_out=d_output, + op=OpKind.PLUS, + num_items=d_lhs.size, + stream=cuda_stream, + ) + + cp.cuda.Device().synchronize() + np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19])) + + +def test_counting_iterator_reduce(): + d_output = cp.empty(1, dtype=np.int32) + + cuda.compute.reduce_into( + d_in=CountingIterator(np.int32(3)), + d_out=d_output, + num_items=8, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int32), + ) + + assert d_output.get()[0] == 52 + + +def test_constant_iterator_reduce(): + d_output = cp.empty(1, dtype=np.float32) + + cuda.compute.reduce_into( + d_in=ConstantIterator(np.float32(1.5)), + d_out=d_output, + num_items=8, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.float32), + ) + + np.testing.assert_allclose(d_output.get()[0], np.float32(12.0)) + + +def test_cache_modified_input_iterator_reduce(): + d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16) + d_output = cp.empty(1, dtype=np.uint16) + iterator = CacheModifiedInputIterator(d_input, modifier="stream") + + cuda.compute.reduce_into( + d_in=iterator, + d_out=d_output, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.uint16), + ) + + assert d_output.get()[0] == 30 + + +def test_reverse_input_iterator_scan(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_output = cp.empty_like(d_input) + + cuda.compute.inclusive_scan( + d_in=ReverseIterator(d_input), + d_out=d_output, + op=OpKind.PLUS, + init_value=np.array([0], dtype=np.int32), + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15])) + + +def test_reverse_output_iterator_scan(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_output = cp.empty_like(d_input) + + cuda.compute.inclusive_scan( + d_in=d_input, + d_out=ReverseIterator(d_output), + op=OpKind.PLUS, + init_value=np.array([0], dtype=np.int32), + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1])) + + +def test_permutation_iterator_reduce(): + d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64) + d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32) + d_output = cp.empty(1, dtype=np.int64) + + cuda.compute.reduce_into( + d_in=PermutationIterator(d_values, d_indices), + d_out=d_output, + num_items=d_indices.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int64), + ) + + assert d_output.get()[0] == 160 + + +def test_transform_iterator_reduce(): + d_output = cp.empty(1, dtype=np.int32) + iterator = TransformIterator( + CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32 + ) + + cuda.compute.reduce_into( + d_in=iterator, + d_out=d_output, + num_items=6, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int32), + ) + + assert d_output.get()[0] == 91 + + +def test_transform_output_iterator_reduce(): + d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16) + d_output = cp.empty(1, dtype=np.int16) + output_iterator = TransformOutputIterator( + d_output, _raw_negate_i16_op(), output_value_type=cccl_int16 + ) + + cuda.compute.reduce_into( + d_in=d_input, + d_out=output_iterator, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int16), + ) + + assert d_output.get()[0] == -10 + + +def test_zip_iterator_transform(): + d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.unary_transform( + d_in=ZipIterator(d_lhs, d_rhs), + d_out=d_output, + op=_raw_zip_sum_i32_op(), + num_items=d_lhs.size, + ) + + np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get()) + + +def test_shuffle_iterator_transform(): + num_items = 17 + d_output = cp.empty(num_items, dtype=np.int64) + + cuda.compute.unary_transform( + d_in=ShuffleIterator(num_items, seed=123), + d_out=d_output, + op=OpKind.IDENTITY, + num_items=num_items, + ) + + result = d_output.get() + assert sorted(result.tolist()) == list(range(num_items)) + + +def test_discard_iterator_transform(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_reference = cp.full_like(d_input, -1) + + cuda.compute.unary_transform( + d_in=d_input, + d_out=DiscardIterator(d_reference), + op=OpKind.IDENTITY, + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32)) diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py index 226904b75da..74d74a7b4e6 100644 --- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py +++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np -import pytest import cuda.compute from cuda.compute.iterators import ( @@ -13,7 +12,6 @@ ) -@pytest.mark.no_numba def test_permutation_iterator_equality(): values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") values2 = cp.asarray([100, 200, 300], dtype="int32") @@ -39,7 +37,6 @@ def test_permutation_iterator_equality(): assert it1.kind != it5.kind -@pytest.mark.no_numba def test_permutation_iterator_with_array_values(): values = cp.asarray([10, 20, 30, 40, 50], dtype="int32") indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -57,7 +54,6 @@ def test_permutation_iterator_with_array_values(): assert d_output[0] == values[indices].sum() -@pytest.mark.no_numba def test_permutation_iterator_with_iterator_values(): values_it = CountingIterator(np.int32(10)) indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -161,7 +157,6 @@ def op(a): assert cp.all(d_out == expected) -@pytest.mark.no_numba def test_caching_permutation_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -209,7 +204,6 @@ def test_caching_permutation_iterator(): ) -@pytest.mark.no_numba def test_permutation_iterator_advance(): """Test PermutationIterator.__add__ only advances indices, not values.""" # Create values array [10, 20, 30, 40, 50, 60, 70] diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index d39a7e07fad..62b8e7dcb44 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -6,6 +6,7 @@ from typing import Tuple import cupy as cp +import numba import numpy as np import pytest @@ -14,7 +15,6 @@ DoubleBuffer, SortOrder, ) -from cuda.core import Device def get_mark(dt, log_size): @@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu DTYPE_SIZE, ) def test_radix_sort_keys(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) radix_sort_device( d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items ) - h_out_keys = d_out_keys.get() - h_out_values = d_out_values.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_values = d_out_values.copy_to_host() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) radix_sort_device(keys_double_buffer, None, None, None, order, num_items) - h_out_keys = keys_double_buffer.current().get() + h_out_keys = keys_double_buffer.current().copy_to_host() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): keys_double_buffer, None, values_double_buffer, None, order, num_items ) - h_out_keys = keys_double_buffer.current().get() - h_out_values = values_double_buffer.current().get() + h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_values = values_double_buffer.current().copy_to_host() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE_BIT_WINDOW, ) def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) radix_sort_device( d_in_keys, @@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): end_bit, ) - h_out_keys = d_out_keys.get() - h_out_values = d_out_values.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_values = d_out_values.copy_to_host() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch end_bit, ) - h_out_keys = keys_double_buffer.current().get() - h_out_values = values_double_buffer.current().get() + h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_values = values_double_buffer.current().copy_to_host() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -469,9 +469,8 @@ def test_radix_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) -@pytest.mark.no_numba def test_radix_sort(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -519,9 +518,8 @@ def test_radix_sort(monkeypatch): np.testing.assert_array_equal(h_out_items, h_in_values) -@pytest.mark.no_numba def test_radix_sort_double_buffer(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py index 15f11428506..df3f024f0ee 100644 --- a/python/cuda_cccl/tests/compute/test_reduce.py +++ b/python/cuda_cccl/tests/compute/test_reduce.py @@ -6,6 +6,7 @@ import random import cupy as cp +import numba.cuda import numpy as np import pytest from cupy.cuda import runtime @@ -72,15 +73,15 @@ def add_op(a, b): def test_device_reduce(dtype, num_items, op): init_value = 42 h_init = np.array([init_value], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = numba.cuda.device_array(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == pytest.approx( sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0 ) # obtained relative error value from c2h/include/c2h/check_results.cuh @@ -93,10 +94,10 @@ def test_device_reduce_with_lambda(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = numba.cuda.device_array(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) # Use a lambda function directly as the reducer cuda.compute.reduce_into( @@ -106,7 +107,7 @@ def test_device_reduce_with_lambda(): op=lambda a, b: a + b, h_init=h_init, ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == sum(h_input) + init_value @@ -117,10 +118,10 @@ def test_device_reduce_with_lambda_variable(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = numba.cuda.device_array(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) # Use a lambda function assigned to a variable as the reducer cuda.compute.reduce_into( @@ -130,24 +131,24 @@ def test_device_reduce_with_lambda_variable(): op=add_op_lambda, h_init=h_init, ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == sum(h_input) + init_value def test_complex_device_reduce(): h_init = np.array([40.0 + 2.0j], dtype=complex) - d_output = cp.empty(1, dtype=complex) + d_output = numba.cuda.device_array(1, dtype=complex) for num_items in [42, 420000]: real_imag = np.random.random((2, num_items)) h_input = real_imag[0] + 1j * real_imag[1] - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) assert d_input.size == num_items cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) @@ -161,11 +162,11 @@ def _test_device_sum_with_iterator( if use_numpy_array: h_input = np.array(l_varr, dtype_inp) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) else: d_input = i_input - d_output = cp.empty(1, dtype_out) # to store device sum + d_output = numba.cuda.device_array(1, dtype_out) # to store device sum h_init = np.array([start_sum_with], dtype_out) @@ -173,7 +174,7 @@ def _test_device_sum_with_iterator( d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == expected_result @@ -215,7 +216,7 @@ def test_device_sum_cache_modified_input_it( l_varr = [rng.randrange(100) for _ in range(num_items)] dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp)) + input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) i_input = CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -690,7 +691,6 @@ def add_op(x, y): ) -@pytest.mark.no_numba def test_device_reduce_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -709,7 +709,6 @@ def test_device_reduce_well_known_plus(): assert (d_output == expected_output).all() -@pytest.mark.no_numba def test_device_reduce_well_known_minimum(): dtype = np.int32 h_init = np.array([100], dtype=dtype) @@ -728,7 +727,6 @@ def test_device_reduce_well_known_minimum(): assert (d_output == expected_output).all() -@pytest.mark.no_numba def test_device_reduce_well_known_maximum(): dtype = np.int32 h_init = np.array([-100], dtype=dtype) @@ -927,7 +925,6 @@ def sqrt(x: dtype) -> dtype: np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) -@pytest.mark.no_numba def test_reduce_with_not_guaranteed_determinism(floating_array): dtype = floating_array.dtype h_init = np.array([0], dtype=dtype) @@ -945,7 +942,6 @@ def test_reduce_with_not_guaranteed_determinism(floating_array): ) -@pytest.mark.no_numba def test_reduce_bool(): h_init = np.array([False]) d_input = cp.array([True, False, True]) diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index 0eb3af34c49..af6d941b11b 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -4,6 +4,7 @@ import cupy as cp +import numba.cuda import numpy as np import pytest @@ -15,7 +16,6 @@ TransformOutputIterator, gpu_struct, ) -from cuda.core import Device def scan_host(h_input: np.ndarray, op, h_init, force_inclusive): @@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea [True, False], ) def test_scan_array_input(force_inclusive, input_array, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification if input is complex # as LDL/STL instructions are emitted for complex types. # Also skip for: @@ -214,7 +214,6 @@ def op(a, b): np.testing.assert_allclose(expected, got, rtol=1e-5) -@pytest.mark.no_numba def test_exclusive_scan_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -233,9 +232,8 @@ def test_exclusive_scan_well_known_plus(): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_inclusive_scan_well_known_plus(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -356,7 +354,7 @@ def add_op(a, b): def test_reverse_input_iterator(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -411,7 +409,6 @@ def add_op(a, b): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba @pytest.mark.parametrize( "force_inclusive", [True, False], @@ -433,14 +430,13 @@ def test_future_init_value(force_inclusive): np.testing.assert_array_equal(expected, got) -@pytest.mark.no_numba def test_no_init_value(monkeypatch): force_inclusive = True num_items = 1024 dtype = np.dtype("int32") # Skip SASS check for CC 9.0 due to LDL/STL CI failure. - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability if cc_major >= 9: import cuda.compute._cccl_interop @@ -460,7 +456,6 @@ def test_no_init_value(monkeypatch): np.testing.assert_array_equal(expected, got) -@pytest.mark.no_numba def test_no_init_value_iterator(): force_inclusive = True num_items = 1024 @@ -498,7 +493,6 @@ def test_inclusive_scan_with_lambda(): np.testing.assert_array_equal(d_output.get(), expected) -@pytest.mark.no_numba @pytest.mark.parametrize("force_inclusive", [True, False]) def test_scan_bool_maximum(force_inclusive): h_init = np.array([False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py index 815c98b8edb..950d860a14d 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py +++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py @@ -269,7 +269,6 @@ def _plus(a, b): ) -@pytest.mark.no_numba def test_segmented_reduce_well_known_plus(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -300,7 +299,6 @@ def test_segmented_reduce_well_known_plus(monkeypatch): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_segmented_reduce_well_known_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -331,7 +329,6 @@ def test_segmented_reduce_well_known_maximum(monkeypatch): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_segmented_reduce_bool_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py index 9c2ab9f2de4..2bdd03abb42 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_sort.py +++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py @@ -5,13 +5,12 @@ from typing import Tuple import cupy as cp +import numba import numpy as np import pytest import cuda.compute -pytestmark = pytest.mark.no_numba - DTYPE_LIST = [ np.uint8, np.int16, @@ -119,8 +118,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=50) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(np.empty_like(h_in_keys)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -134,7 +133,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): order=order, ) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -154,10 +153,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_in_vals = cp.asarray(h_in_vals) - d_out_keys = cp.asarray(np.empty_like(h_in_keys)) - d_out_vals = cp.asarray(np.empty_like(h_in_vals)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_vals = numba.cuda.to_device(h_in_vals) + d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -171,8 +170,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): order=order, ) - h_out_keys = d_out_keys.get() - h_out_vals = d_out_vals.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_vals = d_out_vals.copy_to_host() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -190,8 +189,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): h_in_keys = random_array(num_items, dtype, max_value=20) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) cuda.compute.segmented_sort( @@ -206,7 +205,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().get() + h_out_keys = keys_db.current().copy_to_host() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -225,10 +224,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_in_vals = cp.asarray(h_in_vals) - d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) - d_tmp_vals = cp.asarray(np.empty_like(h_in_vals)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_vals = numba.cuda.to_device(h_in_vals) + d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals) @@ -245,8 +244,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().get() - h_out_vals = vals_db.current().get() + h_out_keys = keys_db.current().copy_to_host() + h_out_vals = vals_db.current().copy_to_host() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -298,10 +297,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments): h_in_keys = random_array(num_items, np.int32, max_value=100) h_in_vals = random_array(num_items, np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_vals = cp.asarray(h_in_vals) - d_out_keys = cp.asarray(np.empty_like(h_in_keys)) - d_out_vals = cp.asarray(np.empty_like(h_in_vals)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_vals = numba.cuda.to_device(h_in_vals) + d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -315,8 +314,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments): order=order, ) - h_out_keys = d_out_keys.get() - h_out_vals = d_out_vals.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_vals = d_out_vals.copy_to_host() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order ) diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index 3dcaf98a507..f2483860a4a 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -8,8 +8,6 @@ import cuda.compute from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct -from cuda.compute._cpp_compile import compile_cpp_op_code -from cuda.compute.op import RawOp DTYPE_LIST = [ np.uint8, @@ -61,35 +59,6 @@ def _host_select(h_in: np.ndarray, cond): return selected, np.int64(selected.size) -def _raw_even_i32_op() -> RawOp: - source = """ -extern "C" __device__ void is_even_i32(void* x, void* result) { - int value = *static_cast(x); - *static_cast(result) = (value % 2) == 0; -} -""" - return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32") - - -@pytest.mark.no_numba -def test_select_raw_op_minimal(): - h_in = np.arange(10, dtype=np.int32) - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) - - cuda.compute.select( - d_in=d_in, - d_out=d_out, - d_num_selected_out=d_num_selected, - cond=_raw_even_i32_op(), - num_items=len(d_in), - ) - - num_selected = int(d_num_selected[0].get()) - np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0]) - - @pytest.mark.parametrize("dtype,num_items", select_params) def test_select_basic(dtype, num_items): h_in = random_array(num_items, dtype, max_value=100) diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py index 715f9e51235..5c82fe1ad38 100644 --- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py +++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py @@ -87,7 +87,6 @@ def test_shuffle_iterator_with_permutation_iterator(): assert sorted(result) == sorted(d_values.get()) -@pytest.mark.no_numba def test_shuffle_iterator_invalid_num_items(): with pytest.raises(ValueError, match="num_items must be > 0"): ShuffleIterator(0, seed=42) diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py index cf1edd7e681..3ddaee54d41 100644 --- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py +++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py @@ -14,8 +14,6 @@ from cuda.compute import gpu_struct -pytestmark = pytest.mark.no_numba - def test_newline_in_field_name_is_rejected(): """Field names with newlines must be rejected — they are the exec() injection vector.""" diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 8def513681b..3fb5b102275 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -8,8 +8,6 @@ import cuda.compute from cuda.compute import CacheModifiedInputIterator, gpu_struct -from cuda.compute._cpp_compile import compile_cpp_op_code -from cuda.compute.op import RawOp DTYPE_LIST = [ np.uint8, @@ -67,50 +65,6 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op): ) -def _raw_less_than_i32(name: str, threshold: int) -> RawOp: - source = f""" -extern "C" __device__ void {name}(void* x, void* result) {{ - int value = *static_cast(x); - *static_cast(result) = value < {threshold} ? 1 : 0; -}} -""" - return RawOp(ltoir=compile_cpp_op_code(source), name=name) - - -@pytest.mark.no_numba -def test_three_way_partition_raw_op_minimal(): - h_in = np.arange(10, dtype=np.int32) - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) - - cuda.compute.three_way_partition( - d_in=d_in, - d_first_part_out=d_first, - d_second_part_out=d_second, - d_unselected_out=d_unselected, - d_num_selected_out=d_num_selected, - select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3), - select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6), - num_items=len(d_in), - ) - - selected = d_num_selected.get() - first_count = int(selected[0]) - second_count = int(selected[1]) - unselected_count = len(h_in) - first_count - second_count - - np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3]) - np.testing.assert_array_equal( - d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)] - ) - np.testing.assert_array_equal( - d_unselected.get()[:unselected_count], h_in[h_in >= 6] - ) - - @pytest.mark.parametrize("dtype,num_items", three_way_partition_params) def test_three_way_partition_basic(dtype, num_items, monkeypatch): # NOTE: the SASS check failure is seen only with NVRTC 13.1: diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py index b8429726d77..c7c3ca2818a 100644 --- a/python/cuda_cccl/tests/compute/test_transform.py +++ b/python/cuda_cccl/tests/compute/test_transform.py @@ -266,7 +266,6 @@ def op2(a): np.testing.assert_allclose(expected, got) -@pytest.mark.no_numba def test_unary_transform_well_known_negate(): """Test unary transform with well-known NEGATE operation.""" dtype = np.int32 @@ -283,7 +282,6 @@ def test_unary_transform_well_known_negate(): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_unary_transform_well_known_identity(): """Test unary transform with well-known IDENTITY operation.""" dtype = np.int32 @@ -300,7 +298,6 @@ def test_unary_transform_well_known_identity(): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba @pytest.mark.parametrize("dtype", [np.int32, np.float16]) def test_binary_transform_well_known_plus(dtype): """Test binary transform with well-known PLUS operation.""" @@ -322,7 +319,6 @@ def test_binary_transform_well_known_plus(dtype): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_binary_transform_well_known_multiplies(): """Test binary transform with well-known MULTIPLIES operation.""" dtype = np.int32 @@ -590,7 +586,6 @@ def test_binary_transform_with_lambda(): np.testing.assert_array_equal(d_out.get(), expected) -@pytest.mark.no_numba def test_binary_transform_bool_equal_to(): d_input1 = cp.array([True, False, True, False], dtype=np.bool_) d_input2 = cp.array([True, True, False, False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 945d4d99028..83a4a17db67 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -4,6 +4,7 @@ import cupy as cp +import numba.cuda import numpy as np import pytest @@ -14,7 +15,6 @@ OpKind, gpu_struct, ) -from cuda.core import Device DTYPE_LIST = [ np.uint8, @@ -125,7 +125,7 @@ def compare_op(lhs, rhs): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key(dtype, num_items, op, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int64) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -226,9 +226,9 @@ def test_unique_by_key_keys_only(): h_out_keys = np.empty(num_items, dtype=np.int32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -240,9 +240,9 @@ def test_unique_by_key_keys_only(): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] expected_keys, _ = unique_by_key_host( h_in_keys, @@ -267,11 +267,11 @@ def compare_complex(lhs, rhs): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -283,10 +283,10 @@ def compare_complex(lhs, rhs): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host( h_in_keys, h_in_items, compare_complex @@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs): def test_unique_by_key_with_stream(cuda_stream, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -417,9 +417,8 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): np.testing.assert_array_equal(h_out_items, expected_items) -@pytest.mark.no_numba def test_unique_by_key_well_known_equal_to(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py index 2788a3c0893..e08709b6e1a 100644 --- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py +++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py @@ -12,16 +12,10 @@ """ import pytest +from numba import types -numba = pytest.importorskip("numba") -types = numba.types - -from cuda.compute._odr_helpers import ( # noqa: E402 - _ArgMode, - _ArgSpec, - _create_void_ptr_wrapper, -) -from cuda.compute._utils import sanitize_identifier # noqa: E402 +from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper +from cuda.compute._utils import sanitize_identifier def _make_arg_specs(): diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index 9fbed38359f..85630bb4f5f 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -12,7 +12,6 @@ ZipIterator, gpu_struct, ) -from cuda.core import Device @pytest.mark.parametrize("num_items", [10, 1_000, 100_000]) @@ -262,8 +261,10 @@ def min_pairs(p1, p2): @pytest.mark.parametrize("num_items", [10, 1000]) def test_output_zip_iterator_with_scan(monkeypatch, num_items): """Test ZipIterator as output iterator with scan operations.""" + import numba.cuda + # Skip SASS check for CC 8.0+ due to LDL/STL CI failure. - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -425,7 +426,9 @@ def sum_nested_zips(v1, v2): ], ) def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map): - cc_major, _ = Device().compute_capability + import numba.cuda + + cc_major, _ = numba.cuda.get_current_device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -499,7 +502,6 @@ def g(x): assert it1.kind != it2.kind -@pytest.mark.no_numba def test_caching_zip_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -575,7 +577,6 @@ def test_caching_zip_iterator(): assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind" -@pytest.mark.no_numba def test_compilation_caching_across_iterator_types(): """Test that compilation caching works across different iterator types.""" from cuda.compute import ConstantIterator