From e461b597054a0695db414cdb7fa730ab2a128a62 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 12:21:29 -0500
Subject: [PATCH 1/7] Decouple from numba.cuda further in tests in favor of
 cupy and cuda.core and add minimal extra testing

---
 ci/matrix.yaml                                |  2 +
 ci/test_cuda_compute_minimal_python.sh        | 51 +++++++++++++
 .../tests/compute/test_binary_search.py       | 35 +++++++++
 .../cuda_cccl/tests/compute/test_bindings.py  |  2 +
 .../compute/test_deferred_annotations.py      |  2 +
 .../tests/compute/test_func_caching.py        |  8 ++
 .../cuda_cccl/tests/compute/test_histogram.py |  2 +
 .../cuda_cccl/tests/compute/test_iterators.py | 10 ++-
 .../tests/compute/test_merge_sort.py          | 55 +++++++-------
 .../tests/compute/test_nested_struct.py       |  5 ++
 .../cuda_cccl/tests/compute/test_no_numba.py  | 75 +-----------------
 .../compute/test_permutation_iterator.py      |  6 ++
 .../tests/compute/test_radix_sort.py          | 76 ++++++++++---------
 python/cuda_cccl/tests/compute/test_reduce.py | 38 +++++-----
 python/cuda_cccl/tests/compute/test_scan.py   | 16 ++--
 .../tests/compute/test_segmented_reduce.py    |  3 +
 .../tests/compute/test_segmented_sort.py      | 51 +++++++------
 python/cuda_cccl/tests/compute/test_select.py | 31 ++++++++
 .../tests/compute/test_shuffle_iterator.py    |  1 +
 .../compute/test_struct_field_validation.py   |  2 +
 .../tests/compute/test_three_way_partition.py | 46 +++++++++++
 .../cuda_cccl/tests/compute/test_transform.py |  5 ++
 .../tests/compute/test_unique_by_key.py       | 69 ++++++++---------
 .../test_void_ptr_wrapper_validation.py       |  4 +-
 .../tests/compute/test_zip_iterator.py        | 11 ++-
 25 files changed, 379 insertions(+), 227 deletions(-)
 create mode 100755 ci/test_cuda_compute_minimal_python.sh

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 6146d863c87..1fcb1df75ae 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -84,6 +84,7 @@ workflows:
     - {jobs: ['test'], project: 'python', ctk: ['12.X',        '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
     - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
     - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'}
+    - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'}
     # CCCL packaging:
     - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'}
     - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
@@ -539,6 +540,7 @@ jobs:
   test_py_headers:  { name: "Test cuda.cccl.headers",      gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
   test_py_coop:     { name: "Test cuda.coop._experimental",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
   test_py_par:      { name: "Test cuda.compute",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
+  test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} }
   test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
 
   # Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
new file mode 100755
index 00000000000..c48b313ed31
--- /dev/null
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(cd "$ci_dir/.." && pwd)"
+source "$ci_dir/pyenv_helper.sh"
+
+# Parse common arguments
+source "$ci_dir/util/python/common_arg_parser.sh"
+parse_python_args "$@"
+require_py_version "Usage: $0 -py-version <python_version>"
+
+cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)
+
+# Setup Python environment
+setup_python_env "${py_version}"
+
+# Fetch or build the cuda_cccl wheel:
+if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
+  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/
+  wheelhouse_dir="/home/coder/cccl/wheelhouse"
+else
+  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
+  wheelhouse_dir="${repo_root}/wheelhouse"
+fi
+
+# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
+# full cu* extras because those pull in numba/numba-cuda. In a clean minimal
+# environment, the test phase below runs only tests marked no_numba.
+CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
+python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
+python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
+
+if python - <<'PY'
+try:
+    import numba.cuda  # noqa: F401
+except Exception as exc:
+    print(f"numba.cuda unavailable; running no_numba subset: {exc!r}")
+    raise SystemExit(1)
+else:
+    print("numba.cuda available; running full compute test suite.")
+PY
+then
+  cd "${repo_root}/python/cuda_cccl/tests/"
+  python -m pytest -n 6 -v compute/ -m "not large"
+else
+  cd "${repo_root}/python/cuda_cccl/tests/"
+  python -m pytest -n 6 -v compute/ -m "not large and no_numba"
+fi
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index a4f7d047a9b..94863c3fe74 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -6,6 +6,8 @@
 import pytest
 
 import cuda.compute
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
 
 DTYPE_LIST = [
     np.int32,
@@ -39,6 +41,39 @@ def disable_sass_check(monkeypatch):
     )
 
 
+def _raw_less_i32_op() -> RawOp:
+    source = """
+extern "C" __device__ void less_i32(void* lhs, void* rhs, void* result) {
+    int lhs_value = *static_cast<int*>(lhs);
+    int rhs_value = *static_cast<int*>(rhs);
+    *static_cast<bool*>(result) = lhs_value < rhs_value;
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="less_i32")
+
+
+@pytest.mark.no_numba
+def test_lower_bound_raw_op_minimal():
+    h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
+    h_values = np.array([0, 3, 4, 10], dtype=np.int32)
+
+    d_data = cp.asarray(h_data)
+    d_values = cp.asarray(h_values)
+    d_out = cp.empty(len(h_values), dtype=np.uintp)
+
+    cuda.compute.lower_bound(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_out,
+        comp=_raw_less_i32_op(),
+    )
+
+    expected = np.searchsorted(h_data, h_values, side="left").astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
 @pytest.mark.parametrize("dtype", DTYPE_LIST)
 @pytest.mark.parametrize(
     "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)]
diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py
index 2f16da620ef..ccfbd8d0bc3 100644
--- a/python/cuda_cccl/tests/compute/test_bindings.py
+++ b/python/cuda_cccl/tests/compute/test_bindings.py
@@ -4,6 +4,8 @@
 
 import cuda.compute._bindings as bindings
 
+pytestmark = pytest.mark.no_numba
+
 
 @pytest.fixture(
     params=[
diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
index c30aeda4068..a584b8ffbe2 100644
--- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py
+++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
@@ -7,10 +7,12 @@
 
 import cupy as cp
 import numpy as np
+import pytest
 
 from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into
 
 
+@pytest.mark.no_numba
 def test_deferred_annotations():
     # test that we can use @gpu_struct with deferred annotations
     # GH: #6421
diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py
index 0da32901a32..5d12458790d 100644
--- a/python/cuda_cccl/tests/compute/test_func_caching.py
+++ b/python/cuda_cccl/tests/compute/test_func_caching.py
@@ -1,10 +1,12 @@
 import numpy as np
+import pytest
 
 from cuda.compute._caching import CachableFunction
 
 global_x = 1
 
 
+@pytest.mark.no_numba
 def test_func_caching_basic():
     def func(x):
         return x
@@ -19,6 +21,7 @@ def func(x):
     assert f1 == f2
 
 
+@pytest.mark.no_numba
 def test_func_caching_different_names():
     def func(x):
         return x
@@ -33,6 +36,7 @@ def func2(x):
     assert f1 != f2
 
 
+@pytest.mark.no_numba
 def test_func_caching_different_code():
     def func(x):
         return x
@@ -46,6 +50,7 @@ def func(x):
     assert f1 != f2
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_closure():
     def factory(x):
         def func(y):
@@ -61,6 +66,7 @@ def func(y):
     assert f1 != f3
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_global_variable():
     global global_x
 
@@ -107,6 +113,7 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func3)
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_global_np_ufunc():
     def make_func():
         def func(x):
@@ -126,6 +133,7 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func2)
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_aliased_np_ufunc():
     def make_func1():
         amin = np.argmin
diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py
index 0a9d96674e6..5c104114754 100644
--- a/python/cuda_cccl/tests/compute/test_histogram.py
+++ b/python/cuda_cccl/tests/compute/test_histogram.py
@@ -12,6 +12,8 @@
     CountingIterator,
 )
 
+pytestmark = pytest.mark.no_numba
+
 DTYPE_LIST = [
     np.uint8,
     np.uint16,
diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py
index 49e2ab39d15..c3b6acc9fc9 100644
--- a/python/cuda_cccl/tests/compute/test_iterators.py
+++ b/python/cuda_cccl/tests/compute/test_iterators.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
@@ -22,6 +21,7 @@
 )
 
 
+@pytest.mark.no_numba
 def test_constant_iterator_equality():
     it1 = ConstantIterator(np.int32(0))
     it2 = ConstantIterator(np.int32(0))
@@ -32,6 +32,7 @@ def test_constant_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 def test_counting_iterator_equality():
     it1 = CountingIterator(np.int32(0))
     it2 = CountingIterator(np.int32(0))
@@ -42,6 +43,7 @@ def test_counting_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 def test_cache_modified_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -122,11 +124,14 @@ def reverse_iterator_array(request):
     if array_type == "cupy":
         array = cp.array(base_array)
     else:
+        import numba.cuda
+
         array = numba.cuda.to_device(base_array)
 
     return array
 
 
+@pytest.mark.no_numba
 def test_reverse_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -141,6 +146,7 @@ def test_reverse_input_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 def test_reverse_output_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -155,6 +161,7 @@ def test_reverse_output_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, itemsize, expected",
     [
@@ -179,6 +186,7 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected):
     assert result == expected
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, dtype",
     [
diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py
index 33d7d15de58..db97d5aca01 100644
--- a/python/cuda_cccl/tests/compute/test_merge_sort.py
+++ b/python/cuda_cccl/tests/compute/test_merge_sort.py
@@ -5,7 +5,6 @@
 from typing import List
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
@@ -82,11 +81,11 @@ def compare_op(lhs, rhs):
 def test_merge_sort_keys(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_keys = cp.asarray(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items)
 
-    h_out_keys = d_in_keys.copy_to_host()
+    h_out_keys = d_in_keys.get()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -102,13 +101,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch):
     h_in_keys = random_array(num_items, dtype)
     h_in_items = random_array(num_items, np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items)
 
-    h_out_keys = d_in_keys.copy_to_host()
-    h_out_items = d_in_items.copy_to_host()
+    h_out_keys = d_in_keys.get()
+    h_out_items = d_in_items.get()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -123,12 +122,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items)
 
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -146,15 +145,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items)
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_items = d_out_items.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_items = d_out_items.get()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -225,11 +224,11 @@ def compare_complex(lhs, rhs):
     imaginary = random_array(num_items, np.int64, max_value)
 
     h_in_keys = real + 1j * imaginary
-    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_keys = cp.asarray(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items)
 
-    h_out_keys = d_in_keys.copy_to_host()
+    h_out_keys = d_in_keys.get()
     h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)]
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -240,15 +239,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream")
 
     merge_sort_device(i_input, None, d_out_keys, None, op, num_items)
 
     h_in_keys.sort()
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
 
@@ -265,10 +264,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
 
     i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -277,8 +276,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
         i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_items = d_out_items.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_items = d_out_items.get()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -307,6 +306,7 @@ def test_merge_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
+@pytest.mark.no_numba
 def test_merge_sort_well_known_less():
     dtype = np.int32
 
@@ -326,6 +326,7 @@ def test_merge_sort_well_known_less():
     np.testing.assert_equal(d_out_keys.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_merge_sort_well_known_greater():
     dtype = np.int32
 
diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py
index 9cbfa40e393..715873af3c8 100644
--- a/python/cuda_cccl/tests/compute/test_nested_struct.py
+++ b/python/cuda_cccl/tests/compute/test_nested_struct.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
+import pytest
 
 import cuda.compute
 from cuda.compute import ZipIterator, gpu_struct
@@ -149,6 +150,7 @@ def sum_pixels(p1, p2):
     assert result["color"]["b"] == expected_b
 
 
+@pytest.mark.no_numba
 def test_dict_init_nested_struct():
     """Test initializing a nested struct with a dictionary."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -162,6 +164,7 @@ def test_dict_init_nested_struct():
     assert np.isclose(obj.inner.b, 3.14)
 
 
+@pytest.mark.no_numba
 def test_dict_init_per_field():
     """Test initializing a struct with a dictionary for a nested field."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -175,6 +178,7 @@ def test_dict_init_per_field():
     assert np.isclose(obj.inner.b, 3.14)
 
 
+@pytest.mark.no_numba
 def test_dict_init_deeply_nested():
     """Test initializing deeply nested structs (3+ levels) with dictionaries."""
     Level1 = gpu_struct({"value": np.int32})
@@ -189,6 +193,7 @@ def test_dict_init_deeply_nested():
     assert obj.middle.nested.value == 42
 
 
+@pytest.mark.no_numba
 def test_dict_init_mixed():
     """Test mixed initialization with some dicts and some direct values."""
     Inner1 = gpu_struct({"a": np.int32, "b": np.int32})
diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py
index 8f1d271e1ed..bbab965c8bf 100644
--- a/python/cuda_cccl/tests/compute/test_no_numba.py
+++ b/python/cuda_cccl/tests/compute/test_no_numba.py
@@ -1,12 +1,6 @@
-import cupy as cp
-import numpy as np
 import pytest
 
-import cuda.compute
-from cuda.compute import OpKind
-
-# Mainly, these tests check that we can use algorithms with OpKind
-# operators while not requiring numba to be installed.
+# Check that tests marked no_numba fail fast if they import numba.
 pytestmark = pytest.mark.no_numba
 
 
@@ -16,70 +10,3 @@ def test_import_numba_raises():
         ImportError, match="This test is marked 'no_numba' but attempted to import it"
     ):
         import numba.cuda  # noqa: F401
-
-
-def test_reduce_op_kind():
-    num_items = 100
-    h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
-
-    h_init = np.array(0, dtype=np.int32)
-    cuda.compute.reduce_into(
-        d_in=d_input, d_out=d_output, num_items=num_items, op=OpKind.PLUS, h_init=h_init
-    )
-
-    result = d_output.get()[0]
-    expected = np.sum(h_input)
-    assert result == expected
-
-
-def test_binary_transform_op_kind():
-    num_items = 100
-    h_input1 = np.arange(num_items, dtype=np.int32)
-    h_input2 = np.arange(num_items, dtype=np.int32) * 2
-    d_input1 = cp.array(h_input1)
-    d_input2 = cp.array(h_input2)
-    d_output = cp.empty(num_items, dtype=np.int32)
-
-    cuda.compute.binary_transform(
-        d_in1=d_input1,
-        d_in2=d_input2,
-        d_out=d_output,
-        op=OpKind.PLUS,
-        num_items=num_items,
-    )
-
-    result = d_output.get()
-    expected = h_input1 + h_input2
-    assert np.array_equal(result, expected)
-
-
-def test_segmented_sort_op_kind():
-    # Create segments: [3, 1, 4] | [1, 5, 9, 2] | [6, 5]
-    num_items = 9
-    h_keys = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5], dtype=np.int32)
-    h_offsets = np.array([0, 3, 7, 9], dtype=np.int32)
-
-    d_keys_in = cp.array(h_keys)
-    d_keys_out = cp.empty(num_items, dtype=np.int32)
-    d_offsets = cp.array(h_offsets)
-
-    num_segments = len(h_offsets) - 1
-
-    cuda.compute.segmented_sort(
-        d_in_keys=d_keys_in,
-        d_out_keys=d_keys_out,
-        d_in_values=None,
-        d_out_values=None,
-        num_items=num_items,
-        num_segments=num_segments,
-        start_offsets_in=d_offsets[:-1],
-        end_offsets_in=d_offsets[1:],
-        order=cuda.compute.SortOrder.ASCENDING,
-    )
-
-    result = d_keys_out.get()
-    # Expected: [1, 3, 4] | [1, 2, 5, 9] | [5, 6]
-    expected = np.array([1, 3, 4, 1, 2, 5, 9, 5, 6], dtype=np.int32)
-    assert np.array_equal(result, expected)
diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
index 74d74a7b4e6..226904b75da 100644
--- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
+import pytest
 
 import cuda.compute
 from cuda.compute.iterators import (
@@ -12,6 +13,7 @@
 )
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_equality():
     values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     values2 = cp.asarray([100, 200, 300], dtype="int32")
@@ -37,6 +39,7 @@ def test_permutation_iterator_equality():
     assert it1.kind != it5.kind
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_with_array_values():
     values = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -54,6 +57,7 @@ def test_permutation_iterator_with_array_values():
     assert d_output[0] == values[indices].sum()
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_with_iterator_values():
     values_it = CountingIterator(np.int32(10))
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -157,6 +161,7 @@ def op(a):
     assert cp.all(d_out == expected)
 
 
+@pytest.mark.no_numba
 def test_caching_permutation_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -204,6 +209,7 @@ def test_caching_permutation_iterator():
     )
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_advance():
     """Test PermutationIterator.__add__ only advances indices, not values."""
     # Create values array [10, 20, 30, 40, 50, 60, 70]
diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index 62b8e7dcb44..67186d3a4f7 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -6,11 +6,11 @@
 from typing import Tuple
 
 import cupy as cp
-import numba
 import numpy as np
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     DoubleBuffer,
     SortOrder,
@@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu
     DTYPE_SIZE,
 )
 def test_radix_sort_keys(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items)
 
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_values = numba.cuda.to_device(h_in_values)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_values = numba.cuda.to_device(h_out_values)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_values = cp.asarray(h_in_values)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_values = cp.asarray(h_out_values)
 
     radix_sort_device(
         d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_values = d_out_values.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_values = d_out_values.get()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
 
     radix_sort_device(keys_double_buffer, None, None, None, order, num_items)
 
-    h_out_keys = keys_double_buffer.current().copy_to_host()
+    h_out_keys = keys_double_buffer.current().get()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_values = numba.cuda.to_device(h_in_values)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_values = numba.cuda.to_device(h_out_values)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_values = cp.asarray(h_in_values)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_values = cp.asarray(h_out_values)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
     values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
         keys_double_buffer, None, values_double_buffer, None, order, num_items
     )
 
-    h_out_keys = keys_double_buffer.current().copy_to_host()
-    h_out_values = values_double_buffer.current().copy_to_host()
+    h_out_keys = keys_double_buffer.current().get()
+    h_out_values = values_double_buffer.current().get()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE_BIT_WINDOW,
 )
 def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = numba.cuda.to_device(h_in_keys)
-        d_in_values = numba.cuda.to_device(h_in_values)
-        d_out_keys = numba.cuda.to_device(h_out_keys)
-        d_out_values = numba.cuda.to_device(h_out_values)
+        d_in_keys = cp.asarray(h_in_keys)
+        d_in_values = cp.asarray(h_in_values)
+        d_out_keys = cp.asarray(h_out_keys)
+        d_out_values = cp.asarray(h_out_values)
 
         radix_sort_device(
             d_in_keys,
@@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
             end_bit,
         )
 
-        h_out_keys = d_out_keys.copy_to_host()
-        h_out_values = d_out_values.copy_to_host()
+        h_out_keys = d_out_keys.get()
+        h_out_values = d_out_values.get()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = numba.cuda.to_device(h_in_keys)
-        d_in_values = numba.cuda.to_device(h_in_values)
-        d_out_keys = numba.cuda.to_device(h_out_keys)
-        d_out_values = numba.cuda.to_device(h_out_values)
+        d_in_keys = cp.asarray(h_in_keys)
+        d_in_values = cp.asarray(h_in_values)
+        d_out_keys = cp.asarray(h_out_keys)
+        d_out_values = cp.asarray(h_out_values)
 
         keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
         values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
             end_bit,
         )
 
-        h_out_keys = keys_double_buffer.current().copy_to_host()
-        h_out_values = values_double_buffer.current().copy_to_host()
+        h_out_keys = keys_double_buffer.current().get()
+        h_out_values = values_double_buffer.current().get()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -469,8 +469,9 @@ def test_radix_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
+@pytest.mark.no_numba
 def test_radix_sort(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -518,8 +519,9 @@ def test_radix_sort(monkeypatch):
     np.testing.assert_array_equal(h_out_items, h_in_values)
 
 
+@pytest.mark.no_numba
 def test_radix_sort_double_buffer(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py
index df3f024f0ee..15f11428506 100644
--- a/python/cuda_cccl/tests/compute/test_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_reduce.py
@@ -6,7 +6,6 @@
 import random
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 from cupy.cuda import runtime
@@ -73,15 +72,15 @@ def add_op(a, b):
 def test_device_reduce(dtype, num_items, op):
     init_value = 42
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = cp.asarray(h_input)
 
     cuda.compute.reduce_into(
         d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init
     )
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == pytest.approx(
         sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0
     )  # obtained relative error value from c2h/include/c2h/check_results.cuh
@@ -94,10 +93,10 @@ def test_device_reduce_with_lambda():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = cp.asarray(h_input)
 
     # Use a lambda function directly as the reducer
     cuda.compute.reduce_into(
@@ -107,7 +106,7 @@ def test_device_reduce_with_lambda():
         op=lambda a, b: a + b,
         h_init=h_init,
     )
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == sum(h_input) + init_value
 
 
@@ -118,10 +117,10 @@ def test_device_reduce_with_lambda_variable():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = cp.asarray(h_input)
 
     # Use a lambda function assigned to a variable as the reducer
     cuda.compute.reduce_into(
@@ -131,24 +130,24 @@ def test_device_reduce_with_lambda_variable():
         op=add_op_lambda,
         h_init=h_init,
     )
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == sum(h_input) + init_value
 
 
 def test_complex_device_reduce():
     h_init = np.array([40.0 + 2.0j], dtype=complex)
-    d_output = numba.cuda.device_array(1, dtype=complex)
+    d_output = cp.empty(1, dtype=complex)
 
     for num_items in [42, 420000]:
         real_imag = np.random.random((2, num_items))
         h_input = real_imag[0] + 1j * real_imag[1]
-        d_input = numba.cuda.to_device(h_input)
+        d_input = cp.asarray(h_input)
         assert d_input.size == num_items
         cuda.compute.reduce_into(
             d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
         )
 
-        result = d_output.copy_to_host()[0]
+        result = d_output.get()[0]
         expected = np.sum(h_input, initial=h_init[0])
         assert result == pytest.approx(expected)
 
@@ -162,11 +161,11 @@ def _test_device_sum_with_iterator(
 
     if use_numpy_array:
         h_input = np.array(l_varr, dtype_inp)
-        d_input = numba.cuda.to_device(h_input)
+        d_input = cp.asarray(h_input)
     else:
         d_input = i_input
 
-    d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
+    d_output = cp.empty(1, dtype_out)  # to store device sum
 
     h_init = np.array([start_sum_with], dtype_out)
 
@@ -174,7 +173,7 @@ def _test_device_sum_with_iterator(
         d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init
     )
 
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == expected_result
 
 
@@ -216,7 +215,7 @@ def test_device_sum_cache_modified_input_it(
     l_varr = [rng.randrange(100) for _ in range(num_items)]
     dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
+    input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp))
     i_input = CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -691,6 +690,7 @@ def add_op(x, y):
         )
 
 
+@pytest.mark.no_numba
 def test_device_reduce_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -709,6 +709,7 @@ def test_device_reduce_well_known_plus():
     assert (d_output == expected_output).all()
 
 
+@pytest.mark.no_numba
 def test_device_reduce_well_known_minimum():
     dtype = np.int32
     h_init = np.array([100], dtype=dtype)
@@ -727,6 +728,7 @@ def test_device_reduce_well_known_minimum():
     assert (d_output == expected_output).all()
 
 
+@pytest.mark.no_numba
 def test_device_reduce_well_known_maximum():
     dtype = np.int32
     h_init = np.array([-100], dtype=dtype)
@@ -925,6 +927,7 @@ def sqrt(x: dtype) -> dtype:
     np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
 
 
+@pytest.mark.no_numba
 def test_reduce_with_not_guaranteed_determinism(floating_array):
     dtype = floating_array.dtype
     h_init = np.array([0], dtype=dtype)
@@ -942,6 +945,7 @@ def test_reduce_with_not_guaranteed_determinism(floating_array):
     )
 
 
+@pytest.mark.no_numba
 def test_reduce_bool():
     h_init = np.array([False])
     d_input = cp.array([True, False, True])
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index af6d941b11b..97f5fae630a 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -4,11 +4,11 @@
 
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     OpKind,
@@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea
     [True, False],
 )
 def test_scan_array_input(force_inclusive, input_array, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification if input is complex
     # as LDL/STL instructions are emitted for complex types.
     # Also skip for:
@@ -214,6 +214,7 @@ def op(a, b):
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
 
+@pytest.mark.no_numba
 def test_exclusive_scan_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -232,8 +233,9 @@ def test_exclusive_scan_well_known_plus():
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_inclusive_scan_well_known_plus(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -354,7 +356,7 @@ def add_op(a, b):
 
 
 def test_reverse_input_iterator(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -409,6 +411,7 @@ def add_op(a, b):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "force_inclusive",
     [True, False],
@@ -430,13 +433,14 @@ def test_future_init_value(force_inclusive):
     np.testing.assert_array_equal(expected, got)
 
 
+@pytest.mark.no_numba
 def test_no_init_value(monkeypatch):
     force_inclusive = True
     num_items = 1024
     dtype = np.dtype("int32")
 
     # Skip SASS check for CC 9.0 due to LDL/STL CI failure.
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     if cc_major >= 9:
         import cuda.compute._cccl_interop
 
@@ -456,6 +460,7 @@ def test_no_init_value(monkeypatch):
     np.testing.assert_array_equal(expected, got)
 
 
+@pytest.mark.no_numba
 def test_no_init_value_iterator():
     force_inclusive = True
     num_items = 1024
@@ -493,6 +498,7 @@ def test_inclusive_scan_with_lambda():
     np.testing.assert_array_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize("force_inclusive", [True, False])
 def test_scan_bool_maximum(force_inclusive):
     h_init = np.array([False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
index 950d860a14d..815c98b8edb 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
@@ -269,6 +269,7 @@ def _plus(a, b):
         )
 
 
+@pytest.mark.no_numba
 def test_segmented_reduce_well_known_plus(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -299,6 +300,7 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_segmented_reduce_well_known_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -329,6 +331,7 @@ def test_segmented_reduce_well_known_maximum(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_segmented_reduce_bool_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py
index 2bdd03abb42..9c2ab9f2de4 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_sort.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py
@@ -5,12 +5,13 @@
 from typing import Tuple
 
 import cupy as cp
-import numba
 import numpy as np
 import pytest
 
 import cuda.compute
 
+pytestmark = pytest.mark.no_numba
+
 DTYPE_LIST = [
     np.uint8,
     np.int16,
@@ -118,8 +119,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=50)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -133,7 +134,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
         order=order,
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -153,10 +154,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_vals = cp.asarray(h_in_vals)
+    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -170,8 +171,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_vals = d_out_vals.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_vals = d_out_vals.get()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -189,8 +190,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
 
     cuda.compute.segmented_sort(
@@ -205,7 +206,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().copy_to_host()
+    h_out_keys = keys_db.current().get()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -224,10 +225,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_vals = cp.asarray(h_in_vals)
+    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_tmp_vals = cp.asarray(np.empty_like(h_in_vals))
 
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
     vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals)
@@ -244,8 +245,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().copy_to_host()
-    h_out_vals = vals_db.current().copy_to_host()
+    h_out_keys = keys_db.current().get()
+    h_out_vals = vals_db.current().get()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -297,10 +298,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
     h_in_keys = random_array(num_items, np.int32, max_value=100)
     h_in_vals = random_array(num_items, np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_vals = cp.asarray(h_in_vals)
+    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -314,8 +315,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
         order=order,
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_vals = d_out_vals.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_vals = d_out_vals.get()
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
     )
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index f2483860a4a..d467d33c635 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -7,6 +7,8 @@
 import pytest
 
 import cuda.compute
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
 from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
 
 DTYPE_LIST = [
@@ -59,6 +61,35 @@ def _host_select(h_in: np.ndarray, cond):
     return selected, np.int64(selected.size)
 
 
+def _raw_even_i32_op() -> RawOp:
+    source = """
+extern "C" __device__ void is_even_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = (value % 2) == 0;
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32")
+
+
+@pytest.mark.no_numba
+def test_select_raw_op_minimal():
+    h_in = np.arange(10, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    d_out = cp.empty_like(d_in)
+    d_num_selected = cp.empty(2, dtype=np.uint64)
+
+    cuda.compute.select(
+        d_in=d_in,
+        d_out=d_out,
+        d_num_selected_out=d_num_selected,
+        cond=_raw_even_i32_op(),
+        num_items=len(d_in),
+    )
+
+    num_selected = int(d_num_selected[0].get())
+    np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0])
+
+
 @pytest.mark.parametrize("dtype,num_items", select_params)
 def test_select_basic(dtype, num_items):
     h_in = random_array(num_items, dtype, max_value=100)
diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
index 5c82fe1ad38..715f9e51235 100644
--- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
@@ -87,6 +87,7 @@ def test_shuffle_iterator_with_permutation_iterator():
     assert sorted(result) == sorted(d_values.get())
 
 
+@pytest.mark.no_numba
 def test_shuffle_iterator_invalid_num_items():
     with pytest.raises(ValueError, match="num_items must be > 0"):
         ShuffleIterator(0, seed=42)
diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
index 3ddaee54d41..cf1edd7e681 100644
--- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py
+++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
@@ -14,6 +14,8 @@
 
 from cuda.compute import gpu_struct
 
+pytestmark = pytest.mark.no_numba
+
 
 def test_newline_in_field_name_is_rejected():
     """Field names with newlines must be rejected — they are the exec() injection vector."""
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 3fb5b102275..9f2683f6798 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -7,6 +7,8 @@
 import pytest
 
 import cuda.compute
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
 from cuda.compute import CacheModifiedInputIterator, gpu_struct
 
 DTYPE_LIST = [
@@ -65,6 +67,50 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op):
     )
 
 
+def _raw_less_than_i32(name: str, threshold: int) -> RawOp:
+    source = f"""
+extern "C" __device__ void {name}(void* x, void* result) {{
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = value < {threshold};
+}}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name=name)
+
+
+@pytest.mark.no_numba
+def test_three_way_partition_raw_op_minimal():
+    h_in = np.arange(10, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    d_first = cp.empty_like(d_in)
+    d_second = cp.empty_like(d_in)
+    d_unselected = cp.empty_like(d_in)
+    d_num_selected = cp.empty(2, dtype=np.uint64)
+
+    cuda.compute.three_way_partition(
+        d_in=d_in,
+        d_first_part_out=d_first,
+        d_second_part_out=d_second,
+        d_unselected_out=d_unselected,
+        d_num_selected_out=d_num_selected,
+        select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3),
+        select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6),
+        num_items=len(d_in),
+    )
+
+    selected = d_num_selected.get()
+    first_count = int(selected[0])
+    second_count = int(selected[1])
+    unselected_count = len(h_in) - first_count - second_count
+
+    np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3])
+    np.testing.assert_array_equal(
+        d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)]
+    )
+    np.testing.assert_array_equal(
+        d_unselected.get()[:unselected_count], h_in[h_in >= 6]
+    )
+
+
 @pytest.mark.parametrize("dtype,num_items", three_way_partition_params)
 def test_three_way_partition_basic(dtype, num_items, monkeypatch):
     # NOTE: the SASS check failure is seen only with NVRTC 13.1:
diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py
index c7c3ca2818a..b8429726d77 100644
--- a/python/cuda_cccl/tests/compute/test_transform.py
+++ b/python/cuda_cccl/tests/compute/test_transform.py
@@ -266,6 +266,7 @@ def op2(a):
     np.testing.assert_allclose(expected, got)
 
 
+@pytest.mark.no_numba
 def test_unary_transform_well_known_negate():
     """Test unary transform with well-known NEGATE operation."""
     dtype = np.int32
@@ -282,6 +283,7 @@ def test_unary_transform_well_known_negate():
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_unary_transform_well_known_identity():
     """Test unary transform with well-known IDENTITY operation."""
     dtype = np.int32
@@ -298,6 +300,7 @@ def test_unary_transform_well_known_identity():
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize("dtype", [np.int32, np.float16])
 def test_binary_transform_well_known_plus(dtype):
     """Test binary transform with well-known PLUS operation."""
@@ -319,6 +322,7 @@ def test_binary_transform_well_known_plus(dtype):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_binary_transform_well_known_multiplies():
     """Test binary transform with well-known MULTIPLIES operation."""
     dtype = np.int32
@@ -586,6 +590,7 @@ def test_binary_transform_with_lambda():
     np.testing.assert_array_equal(d_out.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_binary_transform_bool_equal_to():
     d_input1 = cp.array([True, False, True, False], dtype=np.bool_)
     d_input2 = cp.array([True, True, False, False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 83a4a17db67..5f7d310d71e 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -4,11 +4,11 @@
 
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     CacheModifiedInputIterator,
     DiscardIterator,
@@ -125,7 +125,7 @@ def compare_op(lhs, rhs):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key(dtype, num_items, op, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
-    h_out_items = d_out_items.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_items = d_out_items.get()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int64)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
-    h_out_items = d_out_items.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_items = d_out_items.get()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -226,9 +226,9 @@ def test_unique_by_key_keys_only():
     h_out_keys = np.empty(num_items, dtype=np.int32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -240,9 +240,9 @@ def test_unique_by_key_keys_only():
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
 
     expected_keys, _ = unique_by_key_host(
         h_in_keys,
@@ -267,11 +267,11 @@ def compare_complex(lhs, rhs):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -283,10 +283,10 @@ def compare_complex(lhs, rhs):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
-    h_out_items = d_out_items.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_items = d_out_items.get()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(
         h_in_keys, h_in_items, compare_complex
@@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs):
 
 
 def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -417,8 +417,9 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
     np.testing.assert_array_equal(h_out_items, expected_items)
 
 
+@pytest.mark.no_numba
 def test_unique_by_key_well_known_equal_to(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
index e08709b6e1a..e4d8185eb91 100644
--- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
+++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
@@ -12,7 +12,9 @@
 """
 
 import pytest
-from numba import types
+
+numba = pytest.importorskip("numba")
+types = numba.types
 
 from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper
 from cuda.compute._utils import sanitize_identifier
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index 85630bb4f5f..aea1f167f9f 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -6,6 +6,7 @@
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     TransformIterator,
@@ -261,10 +262,8 @@ def min_pairs(p1, p2):
 @pytest.mark.parametrize("num_items", [10, 1000])
 def test_output_zip_iterator_with_scan(monkeypatch, num_items):
     """Test ZipIterator as output iterator with scan operations."""
-    import numba.cuda
-
     # Skip SASS check for CC 8.0+ due to LDL/STL CI failure.
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -426,9 +425,7 @@ def sum_nested_zips(v1, v2):
     ],
 )
 def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map):
-    import numba.cuda
-
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -502,6 +499,7 @@ def g(x):
     assert it1.kind != it2.kind
 
 
+@pytest.mark.no_numba
 def test_caching_zip_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -577,6 +575,7 @@ def test_caching_zip_iterator():
     assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind"
 
 
+@pytest.mark.no_numba
 def test_compilation_caching_across_iterator_types():
     """Test that compilation caching works across different iterator types."""
     from cuda.compute import ConstantIterator

From d97d809f8eaf9b8fc60fe9ebb337ef52284526c8 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 12:48:11 -0500
Subject: [PATCH 2/7] Stop wrapping binary search comparator in python callable

---
 .../cuda/compute/algorithms/_binary_search.py | 21 +------
 .../tests/compute/test_binary_search.py       | 60 +++++++++++++++++++
 2 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
index 6cadd994ceb..23a99c7bc40 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
@@ -16,19 +16,6 @@
 from ..typing import DeviceArrayLike, IteratorT, Operator
 
 
-def _normalize_comp(comp: Operator | None) -> OpAdapter:
-    # Use a lambda for the default comparator rather than OpKind.LESS
-    # because well-known ops don't carry type information needed by
-    # the binary search JIT compilation.
-    if comp is None or comp is OpKind.LESS:
-
-        def _default_less(a, b):
-            return a < b
-
-        return make_op_adapter(_default_less)
-    return make_op_adapter(comp)
-
-
 class _BinarySearch:
     __slots__ = [
         "build_result",
@@ -96,9 +83,7 @@ def __call__(
         set_cccl_iterator_state(self.d_out_cccl, d_out)
 
         # Update op state for stateful ops
-        comp_adapter = (
-            _normalize_comp(comp) if comp is not None else _normalize_comp(None)
-        )
+        comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp)
         self.op_cccl.state = comp_adapter.get_state()
 
         stream_handle = protocols.validate_and_get_stream(stream)
@@ -154,7 +139,7 @@ def make_lower_bound(
     See Also:
         :func:`lower_bound`
     """
-    comp_adapter = _normalize_comp(comp)
+    comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp)
     return _make_binary_search(
         d_data,
         d_values,
@@ -193,7 +178,7 @@ def make_upper_bound(
     See Also:
         :func:`upper_bound`
     """
-    comp_adapter = _normalize_comp(comp)
+    comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp)
     return _make_binary_search(
         d_data,
         d_values,
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index a4f7d047a9b..585a998c44c 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -6,6 +6,7 @@
 import pytest
 
 import cuda.compute
+from cuda.compute import OpKind
 
 DTYPE_LIST = [
     np.int32,
@@ -39,6 +40,65 @@ def disable_sass_check(monkeypatch):
     )
 
 
+@pytest.mark.parametrize(
+    "search, side",
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_explicit_opkind_less(search, side):
+    h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
+    h_values = np.array([0, 3, 4, 10], dtype=np.int32)
+
+    d_data = cp.asarray(h_data)
+    d_values = cp.asarray(h_values)
+    d_out = cp.empty(len(h_values), dtype=np.uintp)
+
+    search(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_out,
+        comp=OpKind.LESS,
+    )
+
+    expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
+@pytest.mark.parametrize(
+    "search, side",
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_custom_comparator(search, side):
+    h_data = np.array([9, 7, 3, 3, 1], dtype=np.int32)
+    h_values = np.array([10, 4, 3, 0], dtype=np.int32)
+
+    def greater(lhs, rhs):
+        return lhs > rhs
+
+    d_data = cp.asarray(h_data)
+    d_values = cp.asarray(h_values)
+    d_out = cp.empty(len(h_values), dtype=np.uintp)
+
+    search(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_out,
+        comp=greater,
+    )
+
+    expected = np.searchsorted(-h_data, -h_values, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
 @pytest.mark.parametrize("dtype", DTYPE_LIST)
 @pytest.mark.parametrize(
     "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)]

From c564d3e13e9931f25052099ec0f461ccee824e08 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 17:04:23 -0500
Subject: [PATCH 3/7] Address comments

---
 ci/test_cuda_compute_minimal_python.sh                     | 4 ++--
 python/cuda_cccl/tests/compute/test_binary_search.py       | 1 +
 python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
index c48b313ed31..672338bbf33 100755
--- a/ci/test_cuda_compute_minimal_python.sh
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -19,8 +19,8 @@ setup_python_env "${py_version}"
 # Fetch or build the cuda_cccl wheel:
 if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
   wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
-  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/
-  wheelhouse_dir="/home/coder/cccl/wheelhouse"
+  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/"
+  wheelhouse_dir="${repo_root}/wheelhouse"
 else
   "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
   wheelhouse_dir="${repo_root}/wheelhouse"
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 585a998c44c..1e24ca00116 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -47,6 +47,7 @@ def disable_sass_check(monkeypatch):
         (cuda.compute.upper_bound, "right"),
     ],
 )
+@pytest.mark.no_numba
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 9f2683f6798..b9813558de3 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -71,7 +71,7 @@ def _raw_less_than_i32(name: str, threshold: int) -> RawOp:
     source = f"""
 extern "C" __device__ void {name}(void* x, void* result) {{
     int value = *static_cast<int*>(x);
-    *static_cast<bool*>(result) = value < {threshold};
+    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
 }}
 """
     return RawOp(ltoir=compile_cpp_op_code(source), name=name)

From 79df8e0a62cce4a6b943a369406da3987c989aa8 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 17:11:35 -0500
Subject: [PATCH 4/7] fix merge conflict

---
 python/cuda_cccl/tests/compute/test_binary_search.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 9b07eeb8815..1e24ca00116 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -47,10 +47,7 @@ def disable_sass_check(monkeypatch):
         (cuda.compute.upper_bound, "right"),
     ],
 )
-<<<<<<< HEAD
 @pytest.mark.no_numba
-=======
->>>>>>> main
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)

From c2e7c01b5732f968520ef25c21503f244fb02a6b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:16:44 +0000
Subject: [PATCH 5/7] [pre-commit.ci] auto code formatting

---
 python/cuda_cccl/tests/compute/test_radix_sort.py          | 2 +-
 python/cuda_cccl/tests/compute/test_scan.py                | 2 +-
 python/cuda_cccl/tests/compute/test_select.py              | 2 +-
 python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +-
 python/cuda_cccl/tests/compute/test_unique_by_key.py       | 2 +-
 python/cuda_cccl/tests/compute/test_zip_iterator.py        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index 67186d3a4f7..d39a7e07fad 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -10,11 +10,11 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     DoubleBuffer,
     SortOrder,
 )
+from cuda.core import Device
 
 
 def get_mark(dt, log_size):
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index 97f5fae630a..0eb3af34c49 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -8,7 +8,6 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     OpKind,
@@ -16,6 +15,7 @@
     TransformOutputIterator,
     gpu_struct,
 )
+from cuda.core import Device
 
 
 def scan_host(h_input: np.ndarray, op, h_init, force_inclusive):
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index d467d33c635..3dcaf98a507 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -7,9 +7,9 @@
 import pytest
 
 import cuda.compute
+from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
 from cuda.compute._cpp_compile import compile_cpp_op_code
 from cuda.compute.op import RawOp
-from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
 
 DTYPE_LIST = [
     np.uint8,
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index b9813558de3..8def513681b 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -7,9 +7,9 @@
 import pytest
 
 import cuda.compute
+from cuda.compute import CacheModifiedInputIterator, gpu_struct
 from cuda.compute._cpp_compile import compile_cpp_op_code
 from cuda.compute.op import RawOp
-from cuda.compute import CacheModifiedInputIterator, gpu_struct
 
 DTYPE_LIST = [
     np.uint8,
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 5f7d310d71e..945d4d99028 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -8,13 +8,13 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     CacheModifiedInputIterator,
     DiscardIterator,
     OpKind,
     gpu_struct,
 )
+from cuda.core import Device
 
 DTYPE_LIST = [
     np.uint8,
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index aea1f167f9f..9fbed38359f 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -6,13 +6,13 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     TransformIterator,
     ZipIterator,
     gpu_struct,
 )
+from cuda.core import Device
 
 
 @pytest.mark.parametrize("num_items", [10, 1_000, 100_000])

From b09c9c6ecb2df85d4e4fed2279fcfd7efe255254 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 17:26:21 -0500
Subject: [PATCH 6/7] Fix pre-commit

---
 .../tests/compute/test_void_ptr_wrapper_validation.py     | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
index e4d8185eb91..2788a3c0893 100644
--- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
+++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
@@ -16,8 +16,12 @@
 numba = pytest.importorskip("numba")
 types = numba.types
 
-from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper
-from cuda.compute._utils import sanitize_identifier
+from cuda.compute._odr_helpers import (  # noqa: E402
+    _ArgMode,
+    _ArgSpec,
+    _create_void_ptr_wrapper,
+)
+from cuda.compute._utils import sanitize_identifier  # noqa: E402
 
 
 def _make_arg_specs():

From 15652c00259b5650977a4fb33d66d4bd680559b6 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Tue, 23 Jun 2026 09:34:33 -0500
Subject: [PATCH 7/7] Move no numba tests to separate file

---
 ci/test_cuda_compute_minimal_python.sh        |  21 +-
 .../tests/compute/test_binary_search.py       |   1 -
 .../cuda_cccl/tests/compute/test_bindings.py  |   2 -
 .../compute/test_deferred_annotations.py      |   2 -
 .../tests/compute/test_func_caching.py        |   9 -
 .../cuda_cccl/tests/compute/test_histogram.py |   2 -
 .../cuda_cccl/tests/compute/test_iterators.py |  10 +-
 .../tests/compute/test_merge_sort.py          |  55 +-
 .../tests/compute/test_nested_struct.py       |   5 -
 .../cuda_cccl/tests/compute/test_no_numba.py  | 560 +++++++++++++++++-
 .../compute/test_permutation_iterator.py      |   6 -
 .../tests/compute/test_radix_sort.py          |  76 ++-
 python/cuda_cccl/tests/compute/test_reduce.py |  38 +-
 python/cuda_cccl/tests/compute/test_scan.py   |  16 +-
 .../tests/compute/test_segmented_reduce.py    |   3 -
 .../tests/compute/test_segmented_sort.py      |  51 +-
 python/cuda_cccl/tests/compute/test_select.py |  31 -
 .../tests/compute/test_shuffle_iterator.py    |   1 -
 .../compute/test_struct_field_validation.py   |   2 -
 .../tests/compute/test_three_way_partition.py |  46 --
 .../cuda_cccl/tests/compute/test_transform.py |   5 -
 .../tests/compute/test_unique_by_key.py       |  69 ++-
 .../test_void_ptr_wrapper_validation.py       |  12 +-
 .../tests/compute/test_zip_iterator.py        |  11 +-
 24 files changed, 716 insertions(+), 318 deletions(-)

diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
index 672338bbf33..a43bbac7bfa 100755
--- a/ci/test_cuda_compute_minimal_python.sh
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -27,25 +27,10 @@ else
 fi
 
 # Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
-# full cu* extras because those pull in numba/numba-cuda. In a clean minimal
-# environment, the test phase below runs only tests marked no_numba.
+# full cu* extras because those pull in numba/numba-cuda.
 CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
 python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
 python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
 
-if python - <<'PY'
-try:
-    import numba.cuda  # noqa: F401
-except Exception as exc:
-    print(f"numba.cuda unavailable; running no_numba subset: {exc!r}")
-    raise SystemExit(1)
-else:
-    print("numba.cuda available; running full compute test suite.")
-PY
-then
-  cd "${repo_root}/python/cuda_cccl/tests/"
-  python -m pytest -n 6 -v compute/ -m "not large"
-else
-  cd "${repo_root}/python/cuda_cccl/tests/"
-  python -m pytest -n 6 -v compute/ -m "not large and no_numba"
-fi
+cd "${repo_root}/python/cuda_cccl/tests/"
+python -m pytest -n 6 -v compute/test_no_numba.py
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 1e24ca00116..585a998c44c 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -47,7 +47,6 @@ def disable_sass_check(monkeypatch):
         (cuda.compute.upper_bound, "right"),
     ],
 )
-@pytest.mark.no_numba
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)
diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py
index ccfbd8d0bc3..2f16da620ef 100644
--- a/python/cuda_cccl/tests/compute/test_bindings.py
+++ b/python/cuda_cccl/tests/compute/test_bindings.py
@@ -4,8 +4,6 @@
 
 import cuda.compute._bindings as bindings
 
-pytestmark = pytest.mark.no_numba
-
 
 @pytest.fixture(
     params=[
diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
index a584b8ffbe2..c30aeda4068 100644
--- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py
+++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
@@ -7,12 +7,10 @@
 
 import cupy as cp
 import numpy as np
-import pytest
 
 from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into
 
 
-@pytest.mark.no_numba
 def test_deferred_annotations():
     # test that we can use @gpu_struct with deferred annotations
     # GH: #6421
diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py
index e94e0069529..8fcf47b32a7 100644
--- a/python/cuda_cccl/tests/compute/test_func_caching.py
+++ b/python/cuda_cccl/tests/compute/test_func_caching.py
@@ -1,12 +1,10 @@
 import numpy as np
-import pytest
 
 from cuda.compute._caching import CachableFunction
 
 global_x = 1
 
 
-@pytest.mark.no_numba
 def test_func_caching_basic():
     def func(x):
         return x
@@ -21,7 +19,6 @@ def func(x):
     assert f1 == f2
 
 
-@pytest.mark.no_numba
 def test_func_caching_different_names():
     def func(x):
         return x
@@ -36,7 +33,6 @@ def func2(x):
     assert f1 != f2
 
 
-@pytest.mark.no_numba
 def test_func_caching_different_code():
     def func(x):
         return x
@@ -50,7 +46,6 @@ def func(x):
     assert f1 != f2
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_closure():
     def factory(x):
         def func(y):
@@ -66,7 +61,6 @@ def func(y):
     assert f1 != f3
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_numpy_numeric_scalar_closure():
     def factory(indexlength, regularsize):
         index_dtype = np.int64
@@ -86,7 +80,6 @@ def func(counter):
     assert f1 != f3
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_global_variable():
     global global_x
 
@@ -133,7 +126,6 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func3)
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_global_np_ufunc():
     def make_func():
         def func(x):
@@ -153,7 +145,6 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func2)
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_aliased_np_ufunc():
     def make_func1():
         amin = np.argmin
diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py
index 5c104114754..0a9d96674e6 100644
--- a/python/cuda_cccl/tests/compute/test_histogram.py
+++ b/python/cuda_cccl/tests/compute/test_histogram.py
@@ -12,8 +12,6 @@
     CountingIterator,
 )
 
-pytestmark = pytest.mark.no_numba
-
 DTYPE_LIST = [
     np.uint8,
     np.uint16,
diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py
index c3b6acc9fc9..49e2ab39d15 100644
--- a/python/cuda_cccl/tests/compute/test_iterators.py
+++ b/python/cuda_cccl/tests/compute/test_iterators.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -21,7 +22,6 @@
 )
 
 
-@pytest.mark.no_numba
 def test_constant_iterator_equality():
     it1 = ConstantIterator(np.int32(0))
     it2 = ConstantIterator(np.int32(0))
@@ -32,7 +32,6 @@ def test_constant_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 def test_counting_iterator_equality():
     it1 = CountingIterator(np.int32(0))
     it2 = CountingIterator(np.int32(0))
@@ -43,7 +42,6 @@ def test_counting_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 def test_cache_modified_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -124,14 +122,11 @@ def reverse_iterator_array(request):
     if array_type == "cupy":
         array = cp.array(base_array)
     else:
-        import numba.cuda
-
         array = numba.cuda.to_device(base_array)
 
     return array
 
 
-@pytest.mark.no_numba
 def test_reverse_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -146,7 +141,6 @@ def test_reverse_input_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 def test_reverse_output_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -161,7 +155,6 @@ def test_reverse_output_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, itemsize, expected",
     [
@@ -186,7 +179,6 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected):
     assert result == expected
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, dtype",
     [
diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py
index db97d5aca01..33d7d15de58 100644
--- a/python/cuda_cccl/tests/compute/test_merge_sort.py
+++ b/python/cuda_cccl/tests/compute/test_merge_sort.py
@@ -5,6 +5,7 @@
 from typing import List
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -81,11 +82,11 @@ def compare_op(lhs, rhs):
 def test_merge_sort_keys(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items)
 
-    h_out_keys = d_in_keys.get()
+    h_out_keys = d_in_keys.copy_to_host()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -101,13 +102,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch):
     h_in_keys = random_array(num_items, dtype)
     h_in_items = random_array(num_items, np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items)
 
-    h_out_keys = d_in_keys.get()
-    h_out_items = d_in_items.get()
+    h_out_keys = d_in_keys.copy_to_host()
+    h_out_items = d_in_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -122,12 +123,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items)
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -145,15 +146,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items)
 
-    h_out_keys = d_out_keys.get()
-    h_out_items = d_out_items.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_items = d_out_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -224,11 +225,11 @@ def compare_complex(lhs, rhs):
     imaginary = random_array(num_items, np.int64, max_value)
 
     h_in_keys = real + 1j * imaginary
-    d_in_keys = cp.asarray(h_in_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items)
 
-    h_out_keys = d_in_keys.get()
+    h_out_keys = d_in_keys.copy_to_host()
     h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)]
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -239,15 +240,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream")
 
     merge_sort_device(i_input, None, d_out_keys, None, op, num_items)
 
     h_in_keys.sort()
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
 
@@ -264,10 +265,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
 
     i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -276,8 +277,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
         i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_items = d_out_items.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_items = d_out_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -306,7 +307,6 @@ def test_merge_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
-@pytest.mark.no_numba
 def test_merge_sort_well_known_less():
     dtype = np.int32
 
@@ -326,7 +326,6 @@ def test_merge_sort_well_known_less():
     np.testing.assert_equal(d_out_keys.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_merge_sort_well_known_greater():
     dtype = np.int32
 
diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py
index 715873af3c8..9cbfa40e393 100644
--- a/python/cuda_cccl/tests/compute/test_nested_struct.py
+++ b/python/cuda_cccl/tests/compute/test_nested_struct.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
-import pytest
 
 import cuda.compute
 from cuda.compute import ZipIterator, gpu_struct
@@ -150,7 +149,6 @@ def sum_pixels(p1, p2):
     assert result["color"]["b"] == expected_b
 
 
-@pytest.mark.no_numba
 def test_dict_init_nested_struct():
     """Test initializing a nested struct with a dictionary."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -164,7 +162,6 @@ def test_dict_init_nested_struct():
     assert np.isclose(obj.inner.b, 3.14)
 
 
-@pytest.mark.no_numba
 def test_dict_init_per_field():
     """Test initializing a struct with a dictionary for a nested field."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -178,7 +175,6 @@ def test_dict_init_per_field():
     assert np.isclose(obj.inner.b, 3.14)
 
 
-@pytest.mark.no_numba
 def test_dict_init_deeply_nested():
     """Test initializing deeply nested structs (3+ levels) with dictionaries."""
     Level1 = gpu_struct({"value": np.int32})
@@ -193,7 +189,6 @@ def test_dict_init_deeply_nested():
     assert obj.middle.nested.value == 42
 
 
-@pytest.mark.no_numba
 def test_dict_init_mixed():
     """Test mixed initialization with some dicts and some direct values."""
     Inner1 = gpu_struct({"a": np.int32, "b": np.int32})
diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py
index bbab965c8bf..6fb1ef0e811 100644
--- a/python/cuda_cccl/tests/compute/test_no_numba.py
+++ b/python/cuda_cccl/tests/compute/test_no_numba.py
@@ -1,12 +1,568 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import cupy as cp
+import numpy as np
 import pytest
 
-# Check that tests marked no_numba fail fast if they import numba.
+import cuda.compute
+from cuda.compute import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    DiscardIterator,
+    OpKind,
+    PermutationIterator,
+    ReverseIterator,
+    ShuffleIterator,
+    SortOrder,
+    TransformIterator,
+    TransformOutputIterator,
+    ZipIterator,
+)
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
+from cuda.compute.types import int16 as cccl_int16
+from cuda.compute.types import int32 as cccl_int32
+
+# These tests define the minimal-extra integration contract. They intentionally
+# use small fixed inputs and avoid the Python-callable operator path.
 pytestmark = pytest.mark.no_numba
 
 
-@pytest.mark.no_numba
+def _raw_op(source: str, name: str) -> RawOp:
+    return RawOp(ltoir=compile_cpp_op_code(source), name=name)
+
+
+def _raw_even_i32_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_even_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = (value % 2) == 0;
+}
+"""
+    return _raw_op(source, "no_numba_even_i32")
+
+
+def _raw_less_than_i32_op(name: str, threshold: int) -> RawOp:
+    source = f"""
+extern "C" __device__ void {name}(void* x, void* result) {{
+    int value = *static_cast<int*>(x);
+    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
+}}
+"""
+    return _raw_op(source, name)
+
+
+def _raw_plus_i64_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_plus_i64(
+    void* lhs,
+    void* rhs,
+    void* result
+) {
+    *static_cast<long long*>(result) =
+        *static_cast<long long*>(lhs) + *static_cast<long long*>(rhs);
+}
+"""
+    return _raw_op(source, "no_numba_plus_i64")
+
+
+def _raw_square_i32_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_square_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<int*>(result) = value * value;
+}
+"""
+    return _raw_op(source, "no_numba_square_i32")
+
+
+def _raw_zip_sum_i32_op() -> RawOp:
+    source = """
+struct Zip2I32 {
+    int field_0;
+    int field_1;
+};
+
+extern "C" __device__ void no_numba_zip_sum_i32(void* x, void* result) {
+    auto values = static_cast<Zip2I32*>(x);
+    *static_cast<int*>(result) = values->field_0 + values->field_1;
+}
+"""
+    return _raw_op(source, "no_numba_zip_sum_i32")
+
+
+def _raw_negate_i16_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_negate_i16(void* x, void* result) {
+    *static_cast<short*>(result) = -*static_cast<short*>(x);
+}
+"""
+    return _raw_op(source, "no_numba_negate_i16")
+
+
 def test_import_numba_raises():
     with pytest.raises(
         ImportError, match="This test is marked 'no_numba' but attempted to import it"
     ):
         import numba.cuda  # noqa: F401
+
+
+def test_reduce_well_known_plus():
+    h_input = np.arange(1, 14, dtype=np.int32)
+    d_input = cp.asarray(h_input)
+    d_output = cp.empty(1, dtype=np.int32)
+    h_init = np.array([5], dtype=np.int32)
+
+    cuda.compute.reduce_into(
+        d_in=d_input,
+        d_out=d_output,
+        num_items=d_input.size,
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    assert d_output.get()[0] == np.sum(h_input, initial=h_init[0])
+
+
+def test_exclusive_scan_well_known_plus():
+    d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16)
+    d_output = cp.empty_like(d_input)
+    h_init = np.array([1], dtype=np.uint16)
+
+    cuda.compute.exclusive_scan(
+        d_in=d_input,
+        d_out=d_output,
+        op=OpKind.PLUS,
+        init_value=h_init,
+        num_items=d_input.size,
+    )
+
+    expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16)
+    np.testing.assert_array_equal(d_output.get(), expected)
+
+
+def test_binary_transform_well_known_plus():
+    d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32)
+    d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32)
+    d_output = cp.empty_like(d_lhs)
+
+    cuda.compute.binary_transform(
+        d_in1=d_lhs,
+        d_in2=d_rhs,
+        d_out=d_output,
+        op=OpKind.PLUS,
+        num_items=d_lhs.size,
+    )
+
+    np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get())
+
+
+def test_unary_transform_well_known_negate():
+    d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.unary_transform(
+        d_in=d_input,
+        d_out=d_output,
+        op=OpKind.NEGATE,
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4]))
+
+
+@pytest.mark.parametrize(
+    "search, side",
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_explicit_opkind_less(search, side):
+    h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64)
+    h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64)
+    d_out = cp.empty(h_values.size, dtype=np.uintp)
+
+    search(
+        d_data=cp.asarray(h_data),
+        num_items=h_data.size,
+        d_values=cp.asarray(h_values),
+        num_values=h_values.size,
+        d_out=d_out,
+        comp=OpKind.LESS,
+    )
+
+    expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
+def test_segmented_reduce_well_known_plus(monkeypatch):
+    monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
+
+    d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32)
+    d_starts = cp.asarray([0, 3, 5], dtype=np.int32)
+    d_ends = cp.asarray([3, 5, 8], dtype=np.int32)
+    d_output = cp.empty(3, dtype=np.uint32)
+    h_init = np.array([0], dtype=np.uint32)
+
+    cuda.compute.segmented_reduce(
+        d_in=d_input,
+        d_out=d_output,
+        num_segments=3,
+        start_offsets_in=d_starts,
+        end_offsets_in=d_ends,
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21]))
+
+
+def test_merge_sort_well_known_less():
+    d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.merge_sort(
+        d_in_keys=d_input,
+        d_in_values=None,
+        d_out_keys=d_output,
+        d_out_values=None,
+        num_items=d_input.size,
+        op=OpKind.LESS,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get()))
+
+
+def test_radix_sort_key_value_pairs():
+    h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16)
+    h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8)
+    d_out_keys = cp.empty_like(cp.asarray(h_keys))
+    d_out_values = cp.empty_like(cp.asarray(h_values))
+
+    cuda.compute.radix_sort(
+        d_in_keys=cp.asarray(h_keys),
+        d_out_keys=d_out_keys,
+        d_in_values=cp.asarray(h_values),
+        d_out_values=d_out_values,
+        num_items=h_keys.size,
+        order=SortOrder.ASCENDING,
+    )
+
+    order = np.argsort(h_keys, stable=True)
+    np.testing.assert_array_equal(d_out_keys.get(), h_keys[order])
+    np.testing.assert_array_equal(d_out_values.get(), h_values[order])
+
+
+def test_segmented_sort_keys():
+    h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64)
+    h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64)
+    d_output = cp.empty_like(cp.asarray(h_keys))
+
+    cuda.compute.segmented_sort(
+        d_in_keys=cp.asarray(h_keys),
+        d_out_keys=d_output,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=h_keys.size,
+        num_segments=h_offsets.size - 1,
+        start_offsets_in=cp.asarray(h_offsets[:-1]),
+        end_offsets_in=cp.asarray(h_offsets[1:]),
+        order=SortOrder.ASCENDING,
+    )
+
+    expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64)
+    np.testing.assert_array_equal(d_output.get(), expected)
+
+
+def test_unique_by_key_well_known_equal_to(monkeypatch):
+    cc_major, _ = cuda.compute._cccl_interop.CudaDevice().compute_capability
+    if cc_major >= 9:
+        monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
+
+    d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16)
+    d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8)
+    d_out_keys = cp.empty_like(d_keys)
+    d_out_values = cp.empty_like(d_values)
+    d_num_selected = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.unique_by_key(
+        d_in_keys=d_keys,
+        d_in_items=d_values,
+        d_out_keys=d_out_keys,
+        d_out_items=d_out_values,
+        d_out_num_selected=d_num_selected,
+        op=OpKind.EQUAL_TO,
+        num_items=d_keys.size,
+    )
+
+    num_selected = int(d_num_selected.get()[0])
+    np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4])
+    np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40])
+
+
+def test_histogram_even_small_range():
+    h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32)
+    d_histogram = cp.empty(4, dtype=np.int32)
+
+    cuda.compute.histogram_even(
+        d_samples=cp.asarray(h_samples),
+        d_histogram=d_histogram,
+        num_output_levels=5,
+        lower_level=np.float32(0.0),
+        upper_level=np.float32(4.0),
+        num_samples=h_samples.size,
+    )
+
+    expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0))
+    np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32))
+
+
+def test_select_raw_op():
+    h_input = np.arange(12, dtype=np.int32)
+    d_output = cp.empty_like(cp.asarray(h_input))
+    d_num_selected = cp.empty(1, dtype=np.uint64)
+
+    cuda.compute.select(
+        d_in=cp.asarray(h_input),
+        d_out=d_output,
+        d_num_selected_out=d_num_selected,
+        cond=_raw_even_i32_op(),
+        num_items=h_input.size,
+    )
+
+    num_selected = int(d_num_selected.get()[0])
+    np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2])
+
+
+def test_three_way_partition_raw_op():
+    h_input = np.arange(12, dtype=np.int32)
+    d_first = cp.empty_like(cp.asarray(h_input))
+    d_second = cp.empty_like(cp.asarray(h_input))
+    d_unselected = cp.empty_like(cp.asarray(h_input))
+    d_num_selected = cp.empty(2, dtype=np.uint64)
+
+    cuda.compute.three_way_partition(
+        d_in=cp.asarray(h_input),
+        d_first_part_out=d_first,
+        d_second_part_out=d_second,
+        d_unselected_out=d_unselected,
+        d_num_selected_out=d_num_selected,
+        select_first_part_op=_raw_less_than_i32_op("no_numba_less_than_4_i32", 4),
+        select_second_part_op=_raw_less_than_i32_op("no_numba_less_than_8_i32", 8),
+        num_items=h_input.size,
+    )
+
+    selected = d_num_selected.get()
+    first_count = int(selected[0])
+    second_count = int(selected[1])
+    unselected_count = h_input.size - first_count - second_count
+
+    np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4])
+    np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8])
+    np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:])
+
+
+def test_raw_op_reduce():
+    h_input = np.asarray([10, 20, 30, 40], dtype=np.int64)
+    d_output = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.reduce_into(
+        d_in=cp.asarray(h_input),
+        d_out=d_output,
+        num_items=h_input.size,
+        op=_raw_plus_i64_op(),
+        h_init=np.array([5], dtype=np.int64),
+    )
+
+    assert d_output.get()[0] == 105
+
+
+def test_stream_argument(cuda_stream):
+    d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32)
+    d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32)
+    d_output = cp.empty_like(d_lhs)
+
+    cuda.compute.binary_transform(
+        d_in1=d_lhs,
+        d_in2=d_rhs,
+        d_out=d_output,
+        op=OpKind.PLUS,
+        num_items=d_lhs.size,
+        stream=cuda_stream,
+    )
+
+    cp.cuda.Device().synchronize()
+    np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19]))
+
+
+def test_counting_iterator_reduce():
+    d_output = cp.empty(1, dtype=np.int32)
+
+    cuda.compute.reduce_into(
+        d_in=CountingIterator(np.int32(3)),
+        d_out=d_output,
+        num_items=8,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int32),
+    )
+
+    assert d_output.get()[0] == 52
+
+
+def test_constant_iterator_reduce():
+    d_output = cp.empty(1, dtype=np.float32)
+
+    cuda.compute.reduce_into(
+        d_in=ConstantIterator(np.float32(1.5)),
+        d_out=d_output,
+        num_items=8,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.float32),
+    )
+
+    np.testing.assert_allclose(d_output.get()[0], np.float32(12.0))
+
+
+def test_cache_modified_input_iterator_reduce():
+    d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16)
+    d_output = cp.empty(1, dtype=np.uint16)
+    iterator = CacheModifiedInputIterator(d_input, modifier="stream")
+
+    cuda.compute.reduce_into(
+        d_in=iterator,
+        d_out=d_output,
+        num_items=d_input.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.uint16),
+    )
+
+    assert d_output.get()[0] == 30
+
+
+def test_reverse_input_iterator_scan():
+    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.inclusive_scan(
+        d_in=ReverseIterator(d_input),
+        d_out=d_output,
+        op=OpKind.PLUS,
+        init_value=np.array([0], dtype=np.int32),
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15]))
+
+
+def test_reverse_output_iterator_scan():
+    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.inclusive_scan(
+        d_in=d_input,
+        d_out=ReverseIterator(d_output),
+        op=OpKind.PLUS,
+        init_value=np.array([0], dtype=np.int32),
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1]))
+
+
+def test_permutation_iterator_reduce():
+    d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64)
+    d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32)
+    d_output = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.reduce_into(
+        d_in=PermutationIterator(d_values, d_indices),
+        d_out=d_output,
+        num_items=d_indices.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int64),
+    )
+
+    assert d_output.get()[0] == 160
+
+
+def test_transform_iterator_reduce():
+    d_output = cp.empty(1, dtype=np.int32)
+    iterator = TransformIterator(
+        CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32
+    )
+
+    cuda.compute.reduce_into(
+        d_in=iterator,
+        d_out=d_output,
+        num_items=6,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int32),
+    )
+
+    assert d_output.get()[0] == 91
+
+
+def test_transform_output_iterator_reduce():
+    d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16)
+    d_output = cp.empty(1, dtype=np.int16)
+    output_iterator = TransformOutputIterator(
+        d_output, _raw_negate_i16_op(), output_value_type=cccl_int16
+    )
+
+    cuda.compute.reduce_into(
+        d_in=d_input,
+        d_out=output_iterator,
+        num_items=d_input.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int16),
+    )
+
+    assert d_output.get()[0] == -10
+
+
+def test_zip_iterator_transform():
+    d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32)
+    d_output = cp.empty_like(d_lhs)
+
+    cuda.compute.unary_transform(
+        d_in=ZipIterator(d_lhs, d_rhs),
+        d_out=d_output,
+        op=_raw_zip_sum_i32_op(),
+        num_items=d_lhs.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get())
+
+
+def test_shuffle_iterator_transform():
+    num_items = 17
+    d_output = cp.empty(num_items, dtype=np.int64)
+
+    cuda.compute.unary_transform(
+        d_in=ShuffleIterator(num_items, seed=123),
+        d_out=d_output,
+        op=OpKind.IDENTITY,
+        num_items=num_items,
+    )
+
+    result = d_output.get()
+    assert sorted(result.tolist()) == list(range(num_items))
+
+
+def test_discard_iterator_transform():
+    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_reference = cp.full_like(d_input, -1)
+
+    cuda.compute.unary_transform(
+        d_in=d_input,
+        d_out=DiscardIterator(d_reference),
+        op=OpKind.IDENTITY,
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32))
diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
index 226904b75da..74d74a7b4e6 100644
--- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
-import pytest
 
 import cuda.compute
 from cuda.compute.iterators import (
@@ -13,7 +12,6 @@
 )
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_equality():
     values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     values2 = cp.asarray([100, 200, 300], dtype="int32")
@@ -39,7 +37,6 @@ def test_permutation_iterator_equality():
     assert it1.kind != it5.kind
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_with_array_values():
     values = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -57,7 +54,6 @@ def test_permutation_iterator_with_array_values():
     assert d_output[0] == values[indices].sum()
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_with_iterator_values():
     values_it = CountingIterator(np.int32(10))
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -161,7 +157,6 @@ def op(a):
     assert cp.all(d_out == expected)
 
 
-@pytest.mark.no_numba
 def test_caching_permutation_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -209,7 +204,6 @@ def test_caching_permutation_iterator():
     )
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_advance():
     """Test PermutationIterator.__add__ only advances indices, not values."""
     # Create values array [10, 20, 30, 40, 50, 60, 70]
diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index d39a7e07fad..62b8e7dcb44 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -6,6 +6,7 @@
 from typing import Tuple
 
 import cupy as cp
+import numba
 import numpy as np
 import pytest
 
@@ -14,7 +15,6 @@
     DoubleBuffer,
     SortOrder,
 )
-from cuda.core import Device
 
 
 def get_mark(dt, log_size):
@@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu
     DTYPE_SIZE,
 )
 def test_radix_sort_keys(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items)
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_values = cp.asarray(h_in_values)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_values = cp.asarray(h_out_values)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_values = numba.cuda.to_device(h_in_values)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_values = numba.cuda.to_device(h_out_values)
 
     radix_sort_device(
         d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_values = d_out_values.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_values = d_out_values.copy_to_host()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
 
     radix_sort_device(keys_double_buffer, None, None, None, order, num_items)
 
-    h_out_keys = keys_double_buffer.current().get()
+    h_out_keys = keys_double_buffer.current().copy_to_host()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_values = cp.asarray(h_in_values)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_values = cp.asarray(h_out_values)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_values = numba.cuda.to_device(h_in_values)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_values = numba.cuda.to_device(h_out_values)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
     values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
         keys_double_buffer, None, values_double_buffer, None, order, num_items
     )
 
-    h_out_keys = keys_double_buffer.current().get()
-    h_out_values = values_double_buffer.current().get()
+    h_out_keys = keys_double_buffer.current().copy_to_host()
+    h_out_values = values_double_buffer.current().copy_to_host()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE_BIT_WINDOW,
 )
 def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = cp.asarray(h_in_keys)
-        d_in_values = cp.asarray(h_in_values)
-        d_out_keys = cp.asarray(h_out_keys)
-        d_out_values = cp.asarray(h_out_values)
+        d_in_keys = numba.cuda.to_device(h_in_keys)
+        d_in_values = numba.cuda.to_device(h_in_values)
+        d_out_keys = numba.cuda.to_device(h_out_keys)
+        d_out_values = numba.cuda.to_device(h_out_values)
 
         radix_sort_device(
             d_in_keys,
@@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
             end_bit,
         )
 
-        h_out_keys = d_out_keys.get()
-        h_out_values = d_out_values.get()
+        h_out_keys = d_out_keys.copy_to_host()
+        h_out_values = d_out_values.copy_to_host()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = cp.asarray(h_in_keys)
-        d_in_values = cp.asarray(h_in_values)
-        d_out_keys = cp.asarray(h_out_keys)
-        d_out_values = cp.asarray(h_out_values)
+        d_in_keys = numba.cuda.to_device(h_in_keys)
+        d_in_values = numba.cuda.to_device(h_in_values)
+        d_out_keys = numba.cuda.to_device(h_out_keys)
+        d_out_values = numba.cuda.to_device(h_out_values)
 
         keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
         values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
             end_bit,
         )
 
-        h_out_keys = keys_double_buffer.current().get()
-        h_out_values = values_double_buffer.current().get()
+        h_out_keys = keys_double_buffer.current().copy_to_host()
+        h_out_values = values_double_buffer.current().copy_to_host()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -469,9 +469,8 @@ def test_radix_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
-@pytest.mark.no_numba
 def test_radix_sort(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -519,9 +518,8 @@ def test_radix_sort(monkeypatch):
     np.testing.assert_array_equal(h_out_items, h_in_values)
 
 
-@pytest.mark.no_numba
 def test_radix_sort_double_buffer(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py
index 15f11428506..df3f024f0ee 100644
--- a/python/cuda_cccl/tests/compute/test_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_reduce.py
@@ -6,6 +6,7 @@
 import random
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 from cupy.cuda import runtime
@@ -72,15 +73,15 @@ def add_op(a, b):
 def test_device_reduce(dtype, num_items, op):
     init_value = 42
     h_init = np.array([init_value], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = numba.cuda.device_array(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = cp.asarray(h_input)
+    d_input = numba.cuda.to_device(h_input)
 
     cuda.compute.reduce_into(
         d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init
     )
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == pytest.approx(
         sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0
     )  # obtained relative error value from c2h/include/c2h/check_results.cuh
@@ -93,10 +94,10 @@ def test_device_reduce_with_lambda():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = numba.cuda.device_array(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = cp.asarray(h_input)
+    d_input = numba.cuda.to_device(h_input)
 
     # Use a lambda function directly as the reducer
     cuda.compute.reduce_into(
@@ -106,7 +107,7 @@ def test_device_reduce_with_lambda():
         op=lambda a, b: a + b,
         h_init=h_init,
     )
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == sum(h_input) + init_value
 
 
@@ -117,10 +118,10 @@ def test_device_reduce_with_lambda_variable():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = numba.cuda.device_array(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = cp.asarray(h_input)
+    d_input = numba.cuda.to_device(h_input)
 
     # Use a lambda function assigned to a variable as the reducer
     cuda.compute.reduce_into(
@@ -130,24 +131,24 @@ def test_device_reduce_with_lambda_variable():
         op=add_op_lambda,
         h_init=h_init,
     )
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == sum(h_input) + init_value
 
 
 def test_complex_device_reduce():
     h_init = np.array([40.0 + 2.0j], dtype=complex)
-    d_output = cp.empty(1, dtype=complex)
+    d_output = numba.cuda.device_array(1, dtype=complex)
 
     for num_items in [42, 420000]:
         real_imag = np.random.random((2, num_items))
         h_input = real_imag[0] + 1j * real_imag[1]
-        d_input = cp.asarray(h_input)
+        d_input = numba.cuda.to_device(h_input)
         assert d_input.size == num_items
         cuda.compute.reduce_into(
             d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
         )
 
-        result = d_output.get()[0]
+        result = d_output.copy_to_host()[0]
         expected = np.sum(h_input, initial=h_init[0])
         assert result == pytest.approx(expected)
 
@@ -161,11 +162,11 @@ def _test_device_sum_with_iterator(
 
     if use_numpy_array:
         h_input = np.array(l_varr, dtype_inp)
-        d_input = cp.asarray(h_input)
+        d_input = numba.cuda.to_device(h_input)
     else:
         d_input = i_input
 
-    d_output = cp.empty(1, dtype_out)  # to store device sum
+    d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
 
     h_init = np.array([start_sum_with], dtype_out)
 
@@ -173,7 +174,7 @@ def _test_device_sum_with_iterator(
         d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init
     )
 
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == expected_result
 
 
@@ -215,7 +216,7 @@ def test_device_sum_cache_modified_input_it(
     l_varr = [rng.randrange(100) for _ in range(num_items)]
     dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp))
+    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
     i_input = CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -690,7 +691,6 @@ def add_op(x, y):
         )
 
 
-@pytest.mark.no_numba
 def test_device_reduce_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -709,7 +709,6 @@ def test_device_reduce_well_known_plus():
     assert (d_output == expected_output).all()
 
 
-@pytest.mark.no_numba
 def test_device_reduce_well_known_minimum():
     dtype = np.int32
     h_init = np.array([100], dtype=dtype)
@@ -728,7 +727,6 @@ def test_device_reduce_well_known_minimum():
     assert (d_output == expected_output).all()
 
 
-@pytest.mark.no_numba
 def test_device_reduce_well_known_maximum():
     dtype = np.int32
     h_init = np.array([-100], dtype=dtype)
@@ -927,7 +925,6 @@ def sqrt(x: dtype) -> dtype:
     np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
 
 
-@pytest.mark.no_numba
 def test_reduce_with_not_guaranteed_determinism(floating_array):
     dtype = floating_array.dtype
     h_init = np.array([0], dtype=dtype)
@@ -945,7 +942,6 @@ def test_reduce_with_not_guaranteed_determinism(floating_array):
     )
 
 
-@pytest.mark.no_numba
 def test_reduce_bool():
     h_init = np.array([False])
     d_input = cp.array([True, False, True])
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index 0eb3af34c49..af6d941b11b 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -4,6 +4,7 @@
 
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -15,7 +16,6 @@
     TransformOutputIterator,
     gpu_struct,
 )
-from cuda.core import Device
 
 
 def scan_host(h_input: np.ndarray, op, h_init, force_inclusive):
@@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea
     [True, False],
 )
 def test_scan_array_input(force_inclusive, input_array, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification if input is complex
     # as LDL/STL instructions are emitted for complex types.
     # Also skip for:
@@ -214,7 +214,6 @@ def op(a, b):
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
 
-@pytest.mark.no_numba
 def test_exclusive_scan_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -233,9 +232,8 @@ def test_exclusive_scan_well_known_plus():
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_inclusive_scan_well_known_plus(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -356,7 +354,7 @@ def add_op(a, b):
 
 
 def test_reverse_input_iterator(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -411,7 +409,6 @@ def add_op(a, b):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "force_inclusive",
     [True, False],
@@ -433,14 +430,13 @@ def test_future_init_value(force_inclusive):
     np.testing.assert_array_equal(expected, got)
 
 
-@pytest.mark.no_numba
 def test_no_init_value(monkeypatch):
     force_inclusive = True
     num_items = 1024
     dtype = np.dtype("int32")
 
     # Skip SASS check for CC 9.0 due to LDL/STL CI failure.
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     if cc_major >= 9:
         import cuda.compute._cccl_interop
 
@@ -460,7 +456,6 @@ def test_no_init_value(monkeypatch):
     np.testing.assert_array_equal(expected, got)
 
 
-@pytest.mark.no_numba
 def test_no_init_value_iterator():
     force_inclusive = True
     num_items = 1024
@@ -498,7 +493,6 @@ def test_inclusive_scan_with_lambda():
     np.testing.assert_array_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize("force_inclusive", [True, False])
 def test_scan_bool_maximum(force_inclusive):
     h_init = np.array([False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
index 815c98b8edb..950d860a14d 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
@@ -269,7 +269,6 @@ def _plus(a, b):
         )
 
 
-@pytest.mark.no_numba
 def test_segmented_reduce_well_known_plus(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -300,7 +299,6 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_segmented_reduce_well_known_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -331,7 +329,6 @@ def test_segmented_reduce_well_known_maximum(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_segmented_reduce_bool_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py
index 9c2ab9f2de4..2bdd03abb42 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_sort.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py
@@ -5,13 +5,12 @@
 from typing import Tuple
 
 import cupy as cp
+import numba
 import numpy as np
 import pytest
 
 import cuda.compute
 
-pytestmark = pytest.mark.no_numba
-
 DTYPE_LIST = [
     np.uint8,
     np.int16,
@@ -119,8 +118,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=50)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -134,7 +133,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
         order=order,
     )
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -154,10 +153,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_vals = cp.asarray(h_in_vals)
-    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
-    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_vals = numba.cuda.to_device(h_in_vals)
+    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -171,8 +170,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_vals = d_out_vals.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_vals = d_out_vals.copy_to_host()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -190,8 +189,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
 
     cuda.compute.segmented_sort(
@@ -206,7 +205,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().get()
+    h_out_keys = keys_db.current().copy_to_host()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -225,10 +224,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_vals = cp.asarray(h_in_vals)
-    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
-    d_tmp_vals = cp.asarray(np.empty_like(h_in_vals))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_vals = numba.cuda.to_device(h_in_vals)
+    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
 
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
     vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals)
@@ -245,8 +244,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().get()
-    h_out_vals = vals_db.current().get()
+    h_out_keys = keys_db.current().copy_to_host()
+    h_out_vals = vals_db.current().copy_to_host()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -298,10 +297,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
     h_in_keys = random_array(num_items, np.int32, max_value=100)
     h_in_vals = random_array(num_items, np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_vals = cp.asarray(h_in_vals)
-    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
-    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_vals = numba.cuda.to_device(h_in_vals)
+    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -315,8 +314,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
         order=order,
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_vals = d_out_vals.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_vals = d_out_vals.copy_to_host()
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
     )
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index 3dcaf98a507..f2483860a4a 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -8,8 +8,6 @@
 
 import cuda.compute
 from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
-from cuda.compute._cpp_compile import compile_cpp_op_code
-from cuda.compute.op import RawOp
 
 DTYPE_LIST = [
     np.uint8,
@@ -61,35 +59,6 @@ def _host_select(h_in: np.ndarray, cond):
     return selected, np.int64(selected.size)
 
 
-def _raw_even_i32_op() -> RawOp:
-    source = """
-extern "C" __device__ void is_even_i32(void* x, void* result) {
-    int value = *static_cast<int*>(x);
-    *static_cast<bool*>(result) = (value % 2) == 0;
-}
-"""
-    return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32")
-
-
-@pytest.mark.no_numba
-def test_select_raw_op_minimal():
-    h_in = np.arange(10, dtype=np.int32)
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
-
-    cuda.compute.select(
-        d_in=d_in,
-        d_out=d_out,
-        d_num_selected_out=d_num_selected,
-        cond=_raw_even_i32_op(),
-        num_items=len(d_in),
-    )
-
-    num_selected = int(d_num_selected[0].get())
-    np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0])
-
-
 @pytest.mark.parametrize("dtype,num_items", select_params)
 def test_select_basic(dtype, num_items):
     h_in = random_array(num_items, dtype, max_value=100)
diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
index 715f9e51235..5c82fe1ad38 100644
--- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
@@ -87,7 +87,6 @@ def test_shuffle_iterator_with_permutation_iterator():
     assert sorted(result) == sorted(d_values.get())
 
 
-@pytest.mark.no_numba
 def test_shuffle_iterator_invalid_num_items():
     with pytest.raises(ValueError, match="num_items must be > 0"):
         ShuffleIterator(0, seed=42)
diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
index cf1edd7e681..3ddaee54d41 100644
--- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py
+++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
@@ -14,8 +14,6 @@
 
 from cuda.compute import gpu_struct
 
-pytestmark = pytest.mark.no_numba
-
 
 def test_newline_in_field_name_is_rejected():
     """Field names with newlines must be rejected — they are the exec() injection vector."""
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 8def513681b..3fb5b102275 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -8,8 +8,6 @@
 
 import cuda.compute
 from cuda.compute import CacheModifiedInputIterator, gpu_struct
-from cuda.compute._cpp_compile import compile_cpp_op_code
-from cuda.compute.op import RawOp
 
 DTYPE_LIST = [
     np.uint8,
@@ -67,50 +65,6 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op):
     )
 
 
-def _raw_less_than_i32(name: str, threshold: int) -> RawOp:
-    source = f"""
-extern "C" __device__ void {name}(void* x, void* result) {{
-    int value = *static_cast<int*>(x);
-    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
-}}
-"""
-    return RawOp(ltoir=compile_cpp_op_code(source), name=name)
-
-
-@pytest.mark.no_numba
-def test_three_way_partition_raw_op_minimal():
-    h_in = np.arange(10, dtype=np.int32)
-    d_in = cp.asarray(h_in)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
-
-    cuda.compute.three_way_partition(
-        d_in=d_in,
-        d_first_part_out=d_first,
-        d_second_part_out=d_second,
-        d_unselected_out=d_unselected,
-        d_num_selected_out=d_num_selected,
-        select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3),
-        select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6),
-        num_items=len(d_in),
-    )
-
-    selected = d_num_selected.get()
-    first_count = int(selected[0])
-    second_count = int(selected[1])
-    unselected_count = len(h_in) - first_count - second_count
-
-    np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3])
-    np.testing.assert_array_equal(
-        d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)]
-    )
-    np.testing.assert_array_equal(
-        d_unselected.get()[:unselected_count], h_in[h_in >= 6]
-    )
-
-
 @pytest.mark.parametrize("dtype,num_items", three_way_partition_params)
 def test_three_way_partition_basic(dtype, num_items, monkeypatch):
     # NOTE: the SASS check failure is seen only with NVRTC 13.1:
diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py
index b8429726d77..c7c3ca2818a 100644
--- a/python/cuda_cccl/tests/compute/test_transform.py
+++ b/python/cuda_cccl/tests/compute/test_transform.py
@@ -266,7 +266,6 @@ def op2(a):
     np.testing.assert_allclose(expected, got)
 
 
-@pytest.mark.no_numba
 def test_unary_transform_well_known_negate():
     """Test unary transform with well-known NEGATE operation."""
     dtype = np.int32
@@ -283,7 +282,6 @@ def test_unary_transform_well_known_negate():
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_unary_transform_well_known_identity():
     """Test unary transform with well-known IDENTITY operation."""
     dtype = np.int32
@@ -300,7 +298,6 @@ def test_unary_transform_well_known_identity():
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize("dtype", [np.int32, np.float16])
 def test_binary_transform_well_known_plus(dtype):
     """Test binary transform with well-known PLUS operation."""
@@ -322,7 +319,6 @@ def test_binary_transform_well_known_plus(dtype):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_binary_transform_well_known_multiplies():
     """Test binary transform with well-known MULTIPLIES operation."""
     dtype = np.int32
@@ -590,7 +586,6 @@ def test_binary_transform_with_lambda():
     np.testing.assert_array_equal(d_out.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_binary_transform_bool_equal_to():
     d_input1 = cp.array([True, False, True, False], dtype=np.bool_)
     d_input2 = cp.array([True, True, False, False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 945d4d99028..83a4a17db67 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -4,6 +4,7 @@
 
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -14,7 +15,6 @@
     OpKind,
     gpu_struct,
 )
-from cuda.core import Device
 
 DTYPE_LIST = [
     np.uint8,
@@ -125,7 +125,7 @@ def compare_op(lhs, rhs):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key(dtype, num_items, op, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int64)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -226,9 +226,9 @@ def test_unique_by_key_keys_only():
     h_out_keys = np.empty(num_items, dtype=np.int32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -240,9 +240,9 @@ def test_unique_by_key_keys_only():
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
 
     expected_keys, _ = unique_by_key_host(
         h_in_keys,
@@ -267,11 +267,11 @@ def compare_complex(lhs, rhs):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -283,10 +283,10 @@ def compare_complex(lhs, rhs):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(
         h_in_keys, h_in_items, compare_complex
@@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs):
 
 
 def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -417,9 +417,8 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
     np.testing.assert_array_equal(h_out_items, expected_items)
 
 
-@pytest.mark.no_numba
 def test_unique_by_key_well_known_equal_to(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
index 2788a3c0893..e08709b6e1a 100644
--- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
+++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
@@ -12,16 +12,10 @@
 """
 
 import pytest
+from numba import types
 
-numba = pytest.importorskip("numba")
-types = numba.types
-
-from cuda.compute._odr_helpers import (  # noqa: E402
-    _ArgMode,
-    _ArgSpec,
-    _create_void_ptr_wrapper,
-)
-from cuda.compute._utils import sanitize_identifier  # noqa: E402
+from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper
+from cuda.compute._utils import sanitize_identifier
 
 
 def _make_arg_specs():
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index 9fbed38359f..85630bb4f5f 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -12,7 +12,6 @@
     ZipIterator,
     gpu_struct,
 )
-from cuda.core import Device
 
 
 @pytest.mark.parametrize("num_items", [10, 1_000, 100_000])
@@ -262,8 +261,10 @@ def min_pairs(p1, p2):
 @pytest.mark.parametrize("num_items", [10, 1000])
 def test_output_zip_iterator_with_scan(monkeypatch, num_items):
     """Test ZipIterator as output iterator with scan operations."""
+    import numba.cuda
+
     # Skip SASS check for CC 8.0+ due to LDL/STL CI failure.
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -425,7 +426,9 @@ def sum_nested_zips(v1, v2):
     ],
 )
 def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map):
-    cc_major, _ = Device().compute_capability
+    import numba.cuda
+
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -499,7 +502,6 @@ def g(x):
     assert it1.kind != it2.kind
 
 
-@pytest.mark.no_numba
 def test_caching_zip_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -575,7 +577,6 @@ def test_caching_zip_iterator():
     assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind"
 
 
-@pytest.mark.no_numba
 def test_compilation_caching_across_iterator_types():
     """Test that compilation caching works across different iterator types."""
     from cuda.compute import ConstantIterator