From a79569e667706e97b51a876074ef5947431957a8 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 10:59:39 -0500
Subject: [PATCH 01/16] Add initial free threading cuda.compute implementation

---
 c/parallel/src/transform.cu                   |   16 +
 python/cuda_cccl/CMakeLists.txt               |   19 +
 .../cuda_cccl/cuda/compute/_bindings_impl.pyx |    1 +
 python/cuda_cccl/cuda/compute/_caching.py     |  167 +-
 .../cuda/compute/algorithms/_binary_search.py |   20 +-
 .../cuda/compute/algorithms/_histogram.py     |   31 +-
 .../cuda/compute/algorithms/_reduce.py        |   20 +-
 .../cuda/compute/algorithms/_scan.py          |   21 +-
 .../compute/algorithms/_segmented_reduce.py   |   25 +-
 .../compute/algorithms/_sort/_merge_sort.py   |   22 +-
 .../compute/algorithms/_sort/_radix_sort.py   |   27 +-
 .../algorithms/_sort/_segmented_sort.py       |   29 +-
 .../algorithms/_three_way_partition.py        |   28 +-
 .../cuda/compute/algorithms/_transform.py     |   33 +-
 .../cuda/compute/algorithms/_unique_by_key.py |   25 +-
 .../cuda_cccl/cuda/compute/iterators/_base.py |   19 +-
 .../compute/test_free_threading_stress.py     | 1444 +++++++++++++++++
 17 files changed, 1848 insertions(+), 99 deletions(-)
 create mode 100644 python/cuda_cccl/tests/compute/test_free_threading_stress.py
diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu
index 63f36106a16..81e8b1dbf88 100644
--- a/c/parallel/src/transform.cu
+++ b/c/parallel/src/transform.cu
@@ -120,24 +120,32 @@ struct transform_kernel_source
   cub::detail::transform::cuda_expected<cub::detail::transform::async_config>
   CacheAsyncConfiguration(const ActionT& action)
   {
+#if defined(CCCL_PYTHON_FREE_THREADED)
+    return action(); // recomputed on every call; free-threaded builds allocate no transform::cache
+#else // defined(CCCL_PYTHON_FREE_THREADED)
     auto cache = reinterpret_cast<transform::cache*>(build.cache);
     if (!cache->async_config.has_value())
     {
       cache->async_config = action();
     }
     return *cache->async_config;
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
   }
 
   template <class ActionT>
   cub::detail::transform::cuda_expected<cub::detail::transform::prefetch_config>
   CachePrefetchConfiguration(const ActionT& action)
   {
+#if defined(CCCL_PYTHON_FREE_THREADED)
+    return action(); // recomputed on every call; free-threaded builds allocate no transform::cache
+#else // defined(CCCL_PYTHON_FREE_THREADED)
     auto cache = reinterpret_cast<transform::cache*>(build.cache);
     if (!cache->prefetch_config.has_value())
     {
       cache->prefetch_config = action();
     }
     return *cache->prefetch_config;
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
   }
 
   CUkernel TransformKernel() const
@@ -325,7 +333,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho
   build_ptr->cc                         = cc_major * 10 + cc_minor;
   build_ptr->cubin                      = (void*) result.data.release();
   build_ptr->cubin_size                 = result.size;
+#if defined(CCCL_PYTHON_FREE_THREADED)
+  build_ptr->cache                      = nullptr; // config lookups bypass the cache in this mode
+#else // defined(CCCL_PYTHON_FREE_THREADED)
   build_ptr->cache                      = new transform::cache();
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
 
   // avoid new and delete which requires the allocated and freed types to match
   static_assert(::cuda::is_trivially_copyable_v<decltype(policy_sel)>);
@@ -526,7 +538,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H
   build_ptr->cc                         = cc_major * 10 + cc_minor;
   build_ptr->cubin                      = (void*) result.data.release();
   build_ptr->cubin_size                 = result.size;
+#if defined(CCCL_PYTHON_FREE_THREADED)
+  build_ptr->cache                      = nullptr; // config lookups bypass the cache in this mode
+#else // defined(CCCL_PYTHON_FREE_THREADED)
   build_ptr->cache                      = new transform::cache();
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
 
   // avoid new and delete which requires the allocated and freed types to match
   static_assert(::cuda::is_trivially_copyable_v<decltype(policy_sel)>);
diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt
index 09044f19442..905087a7f42 100644
--- a/python/cuda_cccl/CMakeLists.txt
+++ b/python/cuda_cccl/CMakeLists.txt
@@ -83,6 +83,25 @@ install(
 # Build and install Cython extension
 find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
 
+set( # one-liner for the build interpreter: prints 1 only when Py_GIL_DISABLED is set
+  _python_gil_disabled_query
+  "import sysconfig; print('1' if sysconfig.get_config_var('Py_GIL_DISABLED') in (1, '1') else '0')"
+)
+execute_process(
+  COMMAND "${Python3_EXECUTABLE}" -c "${_python_gil_disabled_query}"
+  OUTPUT_VARIABLE _python_gil_disabled
+  RESULT_VARIABLE _python_gil_disabled_result
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if (NOT _python_gil_disabled_result EQUAL 0) # the interpreter could not run the query at all
+  message(FATAL_ERROR "Failed to query Py_GIL_DISABLED from ${Python3_EXECUTABLE}")
+endif()
+
+if ("${_python_gil_disabled}" STREQUAL "1") # every free-threaded CPython, not only 3.14t
+  target_compile_definitions(${_cccl_c_parallel_target} PRIVATE CCCL_PYTHON_FREE_THREADED=1)
+  message(STATUS "Enabling CCCL_PYTHON_FREE_THREADED for free-threaded Python ${Python3_VERSION}")
+endif()
+
 get_filename_component(_python_path "${Python3_EXECUTABLE}" PATH)
 
 set(CYTHON_version_command "${Python3_EXECUTABLE}" -m cython --version)
diff --git a/python/cuda_cccl/cuda/compute/_bindings_impl.pyx b/python/cuda_cccl/cuda/compute/_bindings_impl.pyx
index 89ad14b2209..43c2ad3f1a2 100644
--- a/python/cuda_cccl/cuda/compute/_bindings_impl.pyx
+++ b/python/cuda_cccl/cuda/compute/_bindings_impl.pyx
@@ -6,6 +6,7 @@
 # distutils: language = c++
 # cython: language_level=3
 # cython: linetrace=True
+# cython: freethreading_compatible=True
 
 # Python signatures are declared in the companion Python stub file _bindings.pyi
 # Make sure to update PYI with change to Python API to ensure that Python
diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py
index 8df4779e2cc..f3315bc0a0a 100644
--- a/python/cuda_cccl/cuda/compute/_caching.py
+++ b/python/cuda_cccl/cuda/compute/_caching.py
@@ -6,15 +6,14 @@
 from __future__ import annotations
 
 import functools
+import threading
 import types
+import weakref
 from typing import Any, Callable, Hashable
 
 import numpy as np
 
-try:
-    from cuda.core import Device
-except ImportError:
-    from cuda.core.experimental import Device
+from cuda.core import Device
 
 from ._utils.protocols import get_dtype, get_shape, is_device_array
 from .struct import _Struct
@@ -93,6 +92,150 @@ def _make_cache_key_from_args(*args, **kwargs) -> tuple:
 _cache_registry: dict[str, object] = {}
 
 
+class _ThreadLocalCaches:
+    """
+    Container for wrapper caches owned by a single Python thread.
+
+    Each thread lazily creates its own instance, stored on a module-level
+    ``threading.local()``. ``__weakref__`` is listed in ``__slots__`` so the
+    process-wide registry (a ``weakref.WeakSet``) can track instances without
+    keeping them alive; the caches of a thread that has exited can therefore
+    be garbage collected and drop out of the registry.
+    """
+
+    __slots__ = ("wrapper_caches", "__weakref__")
+
+    def __init__(self) -> None:
+        # Outer key: decorated algorithm factory name. Inner key: current thread
+        # id, current CUDA runtime device ordinal, compute capability, and
+        # specialization key derived from factory arguments.
+        self.wrapper_caches: dict[str, dict[Hashable, Any]] = {}
+
+
+class _InFlightBuild:
+    """
+    Coordination state for one shared build-result currently being built.
+
+    The first thread for a cache key runs the builder. Other threads wait on
+    ``condition`` and receive either the completed build result or the builder's
+    exception.
+    """
+
+    def __init__(self) -> None:
+        self.condition = threading.Condition()
+        self.done = False  # True once the build finished or failed
+        self.result: Any = None  # build result on success
+        self.exception: BaseException | None = None  # builder's error, if it raised
+
+
+_thread_local = threading.local()  # holds each thread's _ThreadLocalCaches as ``.caches``
+# Process-wide registry of per-thread caches. It enables a thread to call
+# clear_all_caches() to clear all caches across all threads.
+_thread_cache_registry: weakref.WeakSet[_ThreadLocalCaches] = weakref.WeakSet()
+_thread_cache_registry_lock = threading.Lock()
+
+_shared_build_cache: dict[Hashable, Any] = {}  # finished build results by key
+_in_flight_builds: dict[Hashable, _InFlightBuild] = {}  # builds still running
+_shared_build_cache_lock = threading.Lock()  # guards both dicts above
+
+
+def _get_current_device_info() -> tuple[int, tuple[int, int]]:
+    current = Device()
+    capability = current.compute_capability  # reduced to a plain (major, minor) pair below
+    return current.device_id, (capability.major, capability.minor)
+
+
+def _get_thread_caches() -> _ThreadLocalCaches:
+    existing = getattr(_thread_local, "caches", None)
+    if existing is not None:
+        return existing
+    created = _thread_local.caches = _ThreadLocalCaches()  # first use on this thread
+    with _thread_cache_registry_lock:
+        _thread_cache_registry.add(created)
+    return created
+
+
+def _clear_wrapper_caches(cache_name: str | None = None) -> None:
+    with _thread_cache_registry_lock:
+        registered = tuple(_thread_cache_registry)  # snapshot; mutate outside the lock
+
+    for per_thread in registered:
+        if cache_name is not None:
+            per_thread.wrapper_caches.pop(cache_name, None)
+        else:
+            per_thread.wrapper_caches.clear()
+
+
+def cache_build_result(
+    build_result_type: type,
+    *key_args,
+    builder: Callable[[], Any],
+) -> Any:
+    """
+    Cache a shared Cython build-result object for the current CUDA device.
+
+    The key intentionally excludes the current Python thread. Wrappers are
+    cached per thread, but build results are shared across threads for the same
+    device ordinal and specialization key.
+
+    Args:
+        build_result_type: Cython build-result type. This separates different
+            build-result caches that may otherwise have identical specialization
+            keys.
+        *key_args: Positional values used to form the specialization part of
+            the cache key.
+        builder: Callable that creates the build result on a cache miss.
+            Exactly one thread runs this callable for a given key while other
+            threads wait for the result.
+
+    Returns:
+        The cached or newly built Cython build-result object.
+    """
+    device_id, cc_key = _get_current_device_info()
+    user_cache_key = _make_cache_key_from_args(*key_args)
+    cache_key = (build_result_type, device_id, cc_key, user_cache_key)
+
+    with _shared_build_cache_lock:  # hit / join / start is decided atomically
+        if cache_key in _shared_build_cache:
+            return _shared_build_cache[cache_key]
+
+        in_flight = _in_flight_builds.get(cache_key)
+        if in_flight is None:
+            in_flight = _InFlightBuild()
+            _in_flight_builds[cache_key] = in_flight
+            is_builder = True  # first requester for this key builds
+        else:
+            is_builder = False  # another thread is already building
+
+    if is_builder:
+        try:
+            result = builder()  # shared lock is not held while building
+        except BaseException as exc:  # any failure must still wake waiters
+            with _shared_build_cache_lock:
+                _in_flight_builds.pop(cache_key, None)  # lets a later call retry
+            with in_flight.condition:
+                in_flight.exception = exc
+                in_flight.done = True
+                in_flight.condition.notify_all()
+            raise
+
+        with _shared_build_cache_lock:
+            _shared_build_cache[cache_key] = result
+            _in_flight_builds.pop(cache_key, None)
+        with in_flight.condition:
+            in_flight.result = result
+            in_flight.done = True
+            in_flight.condition.notify_all()
+        return result
+
+    with in_flight.condition:  # waiter path: block until the builder reports
+        while not in_flight.done:
+            in_flight.condition.wait()
+        if in_flight.exception is not None:
+            raise in_flight.exception  # same exception object the builder raised
+        return in_flight.result
+
+
 class _CacheWithRegisteredKeyFunctions:
     """
     Decorator to cache the result of the decorated function.
@@ -113,19 +256,21 @@ def __call__(self, func: Callable) -> Callable:
         The CUDA compute capability of the current device is appended to
         the cache key.
         """
-        cache: dict = {}
+        cache_name = f"{func.__module__}.{func.__qualname__}"  # unique across modules
 
         @functools.wraps(func)
         def inner(*args, **kwargs):
-            cc = Device().compute_capability
+            device_id, cc_key = _get_current_device_info()  # current CUDA device
             user_cache_key = _make_cache_key_from_args(*args, **kwargs)
-            cache_key = (user_cache_key, tuple(cc))
+            cache_key = (threading.get_ident(), device_id, cc_key, user_cache_key)
+            thread_caches = _get_thread_caches()  # private to the calling thread
+            cache = thread_caches.wrapper_caches.setdefault(cache_name, {})
             if cache_key not in cache:
                 result = func(*args, **kwargs)
                 cache[cache_key] = result
             return cache[cache_key]
 
-        inner.cache_clear = cache.clear  # type: ignore[attr-defined]
+        inner.cache_clear = lambda: _clear_wrapper_caches(cache_name)  # type: ignore[attr-defined]
 
         # Register the cache in the central registry
         _cache_registry[func.__qualname__] = inner
@@ -182,8 +327,10 @@ def clear_all_caches():
     >>> import cuda.compute
     >>> cuda.compute.clear_all_caches()
     """
-    for cached_func in _cache_registry.values():
-        cached_func.cache_clear()
+    _clear_wrapper_caches()
+    with _shared_build_cache_lock:
+        # Keep _in_flight_builds: dropping a live entry would start a duplicate build.
+        _shared_build_cache.clear()
 
 
 class CachableFunction:
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
index 6cadd994ceb..83928127bc6 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
@@ -9,7 +9,7 @@
 
 from .. import _bindings, types
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import call_build, set_cccl_iterator_state
 from .._utils import protocols
 from ..op import OpAdapter, OpKind, make_op_adapter
@@ -71,13 +71,21 @@ def __init__(
 
         self.op_cccl = comp.compile((data_value_type, data_value_type), types.uint8)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceBinarySearchBuildResult,
+            d_data,
+            d_values,
+            d_out,
+            comp,
             mode,
-            self.d_data_cccl,
-            self.d_values_cccl,
-            self.d_out_cccl,
-            self.op_cccl,
+            builder=lambda: call_build(
+                _bindings.DeviceBinarySearchBuildResult,
+                mode,
+                self.d_data_cccl,
+                self.d_values_cccl,
+                self.d_out_cccl,
+                self.op_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
index f865a767dab..6b406989ef2 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
@@ -11,7 +11,7 @@
 
 from .. import _bindings
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import call_build, set_cccl_iterator_state, to_cccl_value_state
 from .._utils.protocols import get_data_pointer, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
@@ -51,17 +51,28 @@ def __init__(
         self.h_lower_level_cccl = cccl.to_cccl_value(h_lower_level)
         self.h_upper_level_cccl = cccl.to_cccl_value(h_upper_level)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceHistogramBuildResult,
-            num_channels,
-            num_active_channels,
-            self.d_samples_cccl,
-            num_levels,
-            self.d_histogram_cccl,
-            self.h_lower_level_cccl,
-            self.num_rows,
-            row_stride_samples,
+            d_samples,
+            d_histogram,
+            int(num_levels),
+            h_lower_level[0].item(),
+            h_upper_level[0].item(),
+            h_lower_level.dtype,
+            num_samples,
             is_evenly_segmented,
+            builder=lambda: call_build(
+                _bindings.DeviceHistogramBuildResult,
+                num_channels,
+                num_active_channels,
+                self.d_samples_cccl,
+                num_levels,
+                self.d_histogram_cccl,
+                self.h_lower_level_cccl,
+                self.num_rows,
+                row_stride_samples,
+                is_evenly_segmented,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
index d9c20cad2dc..d41b5223cb6 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
@@ -11,7 +11,7 @@
 
 from .. import _bindings
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import (
     call_build,
     get_value_type,
@@ -59,13 +59,21 @@ def __init__(
         value_type = get_value_type(h_init)
         self.op_cccl = op.compile((value_type, value_type), value_type)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceReduceBuildResult,
-            self.d_in_cccl,
-            self.d_out_cccl,
-            self.op_cccl,
-            self.h_init_cccl,
+            d_in,
+            d_out,
+            op,
+            h_init,
             determinism,
+            builder=lambda: call_build(
+                _bindings.DeviceReduceBuildResult,
+                self.d_in_cccl,
+                self.d_out_cccl,
+                self.op_cccl,
+                self.h_init_cccl,
+                determinism,
+            ),
         )
 
         match determinism:
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_scan.py b/python/cuda_cccl/cuda/compute/algorithms/_scan.py
index bc7ecd4c587..21d78b342ad 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_scan.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_scan.py
@@ -11,7 +11,7 @@
 
 from .. import _bindings
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import (
     call_build,
     get_value_type,
@@ -89,14 +89,23 @@ def __init__(
         # Compile the op with value types
         self.op_cccl = op.compile((value_type, value_type), value_type)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceScanBuildResult,
-            self.d_in_cccl,
-            self.d_out_cccl,
-            self.op_cccl,
-            init_value_type_info,
+            d_in,
+            d_out,
+            op,
+            init_value,
             force_inclusive,
             self.init_kind,
+            builder=lambda: call_build(
+                _bindings.DeviceScanBuildResult,
+                self.d_in_cccl,
+                self.d_out_cccl,
+                self.op_cccl,
+                init_value_type_info,
+                force_inclusive,
+                self.init_kind,
+            ),
         )
 
         match (force_inclusive, self.init_kind):
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
index 74b593f9944..5edfa5e0312 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
@@ -11,7 +11,7 @@
 
 from .. import _bindings
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import (
     call_build,
     get_value_type,
@@ -58,14 +58,23 @@ def __init__(
 
         self.op_cccl = op.compile((value_type, value_type), value_type)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceSegmentedReduceBuildResult,
-            self.d_in_cccl,
-            self.d_out_cccl,
-            self.start_offsets_in_cccl,
-            self.end_offsets_in_cccl,
-            self.op_cccl,
-            self.h_init_cccl,
+            d_in,
+            d_out,
+            start_offsets_in,
+            end_offsets_in,
+            op,
+            h_init,
+            builder=lambda: call_build(
+                _bindings.DeviceSegmentedReduceBuildResult,
+                self.d_in_cccl,
+                self.d_out_cccl,
+                self.start_offsets_in_cccl,
+                self.end_offsets_in_cccl,
+                self.op_cccl,
+                self.h_init_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
index 1070042a4c4..ace448bd511 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
@@ -8,7 +8,7 @@
 
 from ... import _bindings, types
 from ... import _cccl_interop as cccl
-from ..._caching import cache_with_registered_key_functions
+from ..._caching import cache_build_result, cache_with_registered_key_functions
 from ..._cccl_interop import call_build, set_cccl_iterator_state
 from ..._utils.protocols import (
     get_data_pointer,
@@ -52,13 +52,21 @@ def __init__(
         value_type = cccl.get_value_type(d_in_keys)
         self.op_cccl = op.compile((value_type, value_type), types.int8)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceMergeSortBuildResult,
-            self.d_in_keys_cccl,
-            self.d_in_values_cccl,
-            self.d_out_keys_cccl,
-            self.d_out_values_cccl,
-            self.op_cccl,
+            d_in_keys,
+            d_in_values,
+            d_out_keys,
+            d_out_values,
+            op,
+            builder=lambda: call_build(
+                _bindings.DeviceMergeSortBuildResult,
+                self.d_in_keys_cccl,
+                self.d_in_values_cccl,
+                self.d_out_keys_cccl,
+                self.d_out_values_cccl,
+                self.op_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
index d09dcd0a79a..cf2af440135 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
@@ -7,7 +7,7 @@
 
 from ... import _bindings
 from ... import _cccl_interop as cccl
-from ..._caching import cache_with_registered_key_functions
+from ..._caching import cache_build_result, cache_with_registered_key_functions
 from ..._cccl_interop import call_build, set_cccl_iterator_state
 from ..._utils.protocols import (
     get_data_pointer,
@@ -56,15 +56,26 @@ def __init__(
         )
         decomposer_return_type = "".encode("utf-8")
 
-        self.build_result = call_build(
-            _bindings.DeviceRadixSortBuildResult,
+        build_order = (
             _bindings.SortOrder.ASCENDING
             if order is SortOrder.ASCENDING
-            else _bindings.SortOrder.DESCENDING,
-            self.d_in_keys_cccl,
-            self.d_in_values_cccl,
-            self.decomposer_op,
-            decomposer_return_type,
+            else _bindings.SortOrder.DESCENDING
+        )
+        self.build_result = cache_build_result(
+            _bindings.DeviceRadixSortBuildResult,
+            d_in_keys,
+            d_out_keys,
+            d_in_values,
+            d_out_values,
+            order,
+            builder=lambda: call_build(
+                _bindings.DeviceRadixSortBuildResult,
+                build_order,
+                self.d_in_keys_cccl,
+                self.d_in_values_cccl,
+                self.decomposer_op,
+                decomposer_return_type,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
index d74cd256a81..33ec5279c36 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
@@ -9,7 +9,7 @@
 
 from ... import _bindings
 from ... import _cccl_interop as cccl
-from ..._caching import cache_with_registered_key_functions
+from ..._caching import cache_build_result, cache_with_registered_key_functions
 from ..._cccl_interop import call_build, set_cccl_iterator_state
 from ..._utils.protocols import (
     get_data_pointer,
@@ -52,15 +52,28 @@ def __init__(
         self.start_offsets_in_cccl = cccl.to_cccl_input_iter(start_offsets_in)
         self.end_offsets_in_cccl = cccl.to_cccl_input_iter(end_offsets_in)
 
-        self.build_result = call_build(
-            _bindings.DeviceSegmentedSortBuildResult,
+        build_order = (
             _bindings.SortOrder.ASCENDING
             if order is SortOrder.ASCENDING
-            else _bindings.SortOrder.DESCENDING,
-            self.d_in_keys_cccl,
-            self.d_in_values_cccl,
-            self.start_offsets_in_cccl,
-            self.end_offsets_in_cccl,
+            else _bindings.SortOrder.DESCENDING
+        )
+        self.build_result = cache_build_result(
+            _bindings.DeviceSegmentedSortBuildResult,
+            d_in_keys,
+            d_out_keys,
+            d_in_values,
+            d_out_values,
+            start_offsets_in,
+            end_offsets_in,
+            order,
+            builder=lambda: call_build(
+                _bindings.DeviceSegmentedSortBuildResult,
+                build_order,
+                self.d_in_keys_cccl,
+                self.d_in_values_cccl,
+                self.start_offsets_in_cccl,
+                self.end_offsets_in_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
index fbd3154feb8..d5b076028c5 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
@@ -9,7 +9,7 @@
 
 from .. import _bindings, types
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import call_build, set_cccl_iterator_state
 from .._utils import protocols
 from .._utils.temp_storage_buffer import TempStorageBuffer
@@ -54,15 +54,25 @@ def __init__(
             (value_type,), types.uint8
         )
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceThreeWayPartitionBuildResult,
-            self.d_in_cccl,
-            self.d_first_part_out_cccl,
-            self.d_second_part_out_cccl,
-            self.d_unselected_out_cccl,
-            self.d_num_selected_out_cccl,
-            self.select_first_part_op_cccl,
-            self.select_second_part_op_cccl,
+            d_in,
+            d_first_part_out,
+            d_second_part_out,
+            d_unselected_out,
+            d_num_selected_out,
+            select_first_part_op,
+            select_second_part_op,
+            builder=lambda: call_build(
+                _bindings.DeviceThreeWayPartitionBuildResult,
+                self.d_in_cccl,
+                self.d_first_part_out_cccl,
+                self.d_second_part_out_cccl,
+                self.d_unselected_out_cccl,
+                self.d_num_selected_out_cccl,
+                self.select_first_part_op_cccl,
+                self.select_second_part_op_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_transform.py b/python/cuda_cccl/cuda/compute/algorithms/_transform.py
index f987efd915c..4e32f9dfbdc 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_transform.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_transform.py
@@ -9,7 +9,7 @@
 
 from .. import _bindings
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import set_cccl_iterator_state
 from .._utils import protocols
 from ..op import OpAdapter, make_op_adapter
@@ -33,11 +33,17 @@ def __init__(
         out_type = cccl.get_value_type(d_out)
         self.op_cccl = op.compile((in_type,), out_type)
 
-        self.build_result = cccl.call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceUnaryTransform,
-            self.d_in_cccl,
-            self.d_out_cccl,
-            self.op_cccl,
+            d_in,
+            d_out,
+            op,
+            builder=lambda: cccl.call_build(
+                _bindings.DeviceUnaryTransform,
+                self.d_in_cccl,
+                self.d_out_cccl,
+                self.op_cccl,
+            ),
         )
 
     def __call__(
@@ -92,12 +98,19 @@ def __init__(
         out_type = cccl.get_value_type(d_out)
         self.op_cccl = op.compile((in1_type, in2_type), out_type)
 
-        self.build_result = cccl.call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceBinaryTransform,
-            self.d_in1_cccl,
-            self.d_in2_cccl,
-            self.d_out_cccl,
-            self.op_cccl,
+            d_in1,
+            d_in2,
+            d_out,
+            op,
+            builder=lambda: cccl.call_build(
+                _bindings.DeviceBinaryTransform,
+                self.d_in1_cccl,
+                self.d_in2_cccl,
+                self.d_out_cccl,
+                self.op_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
index 0e39f182507..7b42038dfbf 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
@@ -8,7 +8,7 @@
 
 from .. import _bindings, types
 from .. import _cccl_interop as cccl
-from .._caching import cache_with_registered_key_functions
+from .._caching import cache_build_result, cache_with_registered_key_functions
 from .._cccl_interop import call_build, set_cccl_iterator_state
 from .._utils.protocols import (
     get_data_pointer,
@@ -49,14 +49,23 @@ def __init__(
         value_type = cccl.get_value_type(d_in_keys)
         self.op_cccl = op.compile((value_type, value_type), types.uint8)
 
-        self.build_result = call_build(
+        self.build_result = cache_build_result(
             _bindings.DeviceUniqueByKeyBuildResult,
-            self.d_in_keys_cccl,
-            self.d_in_items_cccl,
-            self.d_out_keys_cccl,
-            self.d_out_items_cccl,
-            self.d_out_num_selected_cccl,
-            self.op_cccl,
+            d_in_keys,
+            d_in_items,
+            d_out_keys,
+            d_out_items,
+            d_out_num_selected,
+            op,
+            builder=lambda: call_build(
+                _bindings.DeviceUniqueByKeyBuildResult,
+                self.d_in_keys_cccl,
+                self.d_in_items_cccl,
+                self.d_out_keys_cccl,
+                self.d_out_items_cccl,
+                self.d_out_num_selected_cccl,
+                self.op_cccl,
+            ),
         )
 
     def __call__(
diff --git a/python/cuda_cccl/cuda/compute/iterators/_base.py b/python/cuda_cccl/cuda/compute/iterators/_base.py
index cc1b1b83fc8..746f2dc5931 100644
--- a/python/cuda_cccl/cuda/compute/iterators/_base.py
+++ b/python/cuda_cccl/cuda/compute/iterators/_base.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import hashlib
+import threading
 from typing import Hashable
 
 from .._bindings import Iterator, IteratorKind, IteratorState, Op
@@ -54,6 +55,7 @@ class IteratorBase:
         "_input_deref_op",
         "_output_deref_op",
         "_uid_cached",
+        "_op_lock",
     ]
 
     def __init__(
@@ -75,6 +77,11 @@ def __init__(
         self._input_deref_op: Op | None = None
         self._output_deref_op: Op | None = None
         self._uid_cached: str | None = None
+        # Free-threaded Python can let multiple threads share a read-only
+        # iterator object and race during the first lazy Op construction.
+        # The lock only protects that miss path (re-taken on every call while a
+        # deref Op stays None); hits are lock-free, mutation is the caller's job.
+        self._op_lock = threading.Lock()
 
     @property
     def state(self) -> IteratorState:
@@ -117,19 +124,25 @@ def _make_output_deref_symbol(self) -> str:
     def get_advance_op(self) -> Op:
         """Get the cached Op for the advance operation."""
         if self._advance_op is None:
-            self._advance_op = self._make_advance_op()
+            with self._op_lock:  # miss path only: serialize the first construction
+                if self._advance_op is None:  # re-check: another thread may have built it
+                    self._advance_op = self._make_advance_op()
         return self._advance_op
 
     def get_input_deref_op(self) -> Op | None:
         """Get the cached Op for input dereference operation, or None if not supported."""
         if self._input_deref_op is None:
-            self._input_deref_op = self._make_input_deref_op()
+            with self._op_lock:  # a None result is not cached, so it lands here again
+                if self._input_deref_op is None:  # re-check under the lock
+                    self._input_deref_op = self._make_input_deref_op()
         return self._input_deref_op
 
     def get_output_deref_op(self) -> Op | None:
         """Get the cached Op for output dereference operation, or None if not supported."""
         if self._output_deref_op is None:
-            self._output_deref_op = self._make_output_deref_op()
+            with self._op_lock:  # a None result is not cached, so it lands here again
+                if self._output_deref_op is None:  # re-check under the lock
+                    self._output_deref_op = self._make_output_deref_op()
         return self._output_deref_op
 
     @property
diff --git a/python/cuda_cccl/tests/compute/test_free_threading_stress.py b/python/cuda_cccl/tests/compute/test_free_threading_stress.py
new file mode 100644
index 00000000000..fa2e389a627
--- /dev/null
+++ b/python/cuda_cccl/tests/compute/test_free_threading_stress.py
@@ -0,0 +1,1444 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import concurrent.futures
+import os
+import sys
+import sysconfig
+import threading
+from dataclasses import dataclass
+from typing import Callable
+
+import numpy as np
+import pytest
+
+
+pytestmark = [
+    pytest.mark.no_numba,
+    pytest.mark.no_verify_sass(
+        reason="Free-threading stress tests intentionally run concurrent workers."
+    ),
+]
+
+STRESS_ITERATIONS = int(os.environ.get("CCCL_FREE_THREADING_STRESS_ITERATIONS", "10"))
+STRESS_THREADS = int(os.environ.get("CCCL_FREE_THREADING_STRESS_THREADS", "2"))
+TRANSFORM_NATIVE_CACHE_THREADS = int(
+    os.environ.get(
+        "CCCL_FREE_THREADING_TRANSFORM_NATIVE_CACHE_THREADS",
+        str(max(STRESS_THREADS, 4)),
+    )
+)
+
+
+def _is_free_threaded_build() -> bool:
+    return sysconfig.get_config_var("Py_GIL_DISABLED") in (1, "1")  # int or str form
+
+
+def _assert_gil_disabled(where: str) -> None:
+    """Fail the current test when ``sys._is_gil_enabled`` exists and returns true."""
+    if getattr(sys, "_is_gil_enabled", lambda: False)():
+        pytest.fail(f"the GIL is enabled {where}")
+
+
+def _require_free_threaded_python() -> None:
+    if not _is_free_threaded_build():  # build configuration, read from sysconfig
+        pytest.skip("requires a free-threaded CPython build")
+    _assert_gil_disabled("before importing cuda.compute")  # runtime GIL state
+
+
+@pytest.fixture
+def compute_modules():
+    """Yield ``(cupy, cuda.compute)``; caches are cleared before and after the test."""
+    _require_free_threaded_python()
+
+    # Imports are deferred: the skip runs first, then the GIL is checked after each.
+    import cupy as cp
+
+    _assert_gil_disabled("after importing cupy")
+    import cuda.compute as cc
+    _assert_gil_disabled("after importing cuda.compute")
+    cc.clear_all_caches()
+    try:
+        yield cp, cc
+    finally:
+        cc.clear_all_caches()
+
+
+class _CudaStream:  # adapter around a stream object that exposes ``.ptr``
+    def __init__(self, stream):
+        self.stream = stream
+
+    def __cuda_stream__(self):
+        return (0, self.stream.ptr)  # presumably (protocol version, handle) - confirm
+
+    @property
+    def ptr(self):
+        return self.stream.ptr
+
+
+def _make_stream(cp):
+    stream = cp.cuda.Stream()
+    return stream, _CudaStream(stream)  # (cupy stream, adapter wrapping that same stream)
+
+
+def _run_threaded(workers: list[Callable[[threading.Barrier], None]]) -> None:
+    """Run each worker on its own pool thread, handing all of them one shared barrier."""
+    start_gate = threading.Barrier(len(workers))
+    with concurrent.futures.ThreadPoolExecutor(max_workers=len(workers)) as pool:
+        for pending in [pool.submit(job, start_gate) for job in workers]:
+            pending.result()  # re-raises a worker's exception, in submit order
+    _assert_gil_disabled("after concurrent cuda.compute operations")
+
+
+def _call_with_temp(cp, algorithm, **kwargs):  # size query, then the real launch
+    with getattr(kwargs.get("stream"), "stream", cp.cuda.get_current_stream()):
+        scratch = cp.empty(algorithm(temp_storage=None, **kwargs), dtype=np.uint8)
+        return algorithm(temp_storage=scratch, **kwargs)  # scratch: launch-stream arena
+
+
+def _get_build_result(algorithm):
+    if hasattr(algorithm, "build_result"):
+        return algorithm.build_result
+    if hasattr(algorithm, "partitioner"):  # no direct attribute: look in the wrapped object
+        return _get_build_result(algorithm.partitioner)
+    raise AssertionError(f"{type(algorithm).__name__} does not expose a build result")
+
+
+def _selected_segments(keys, values, starts, ends, descending=False):
+    """Host reference: sort each [start, end) key slice and permute its values alike."""
+    sorted_keys = keys.copy()
+    sorted_values = values.copy()
+    step = -1 if descending else 1
+    for lo, hi in zip(starts, ends):
+        # Descending is the reversed stable ascending order, so ties are reversed too.
+        perm = np.argsort(keys[lo:hi], kind="stable")[::step]
+        sorted_keys[lo:hi] = keys[lo:hi][perm]
+        sorted_values[lo:hi] = values[lo:hi][perm]
+    return sorted_keys, sorted_values
+
+
+@dataclass(frozen=True)
+class _AlgorithmCase:
+    name: str  # returned by __str__
+    make_shared: Callable  # called as (cp, cc); returns an algorithm object
+    make_worker: Callable  # called as (cp, cc, worker_id=..., iteration=...)
+    run: Callable  # called as (cp, cc, algorithm, worker)
+    check: Callable  # called as (cp, cc, worker) after run
+
+    def __str__(self):
+        return self.name
+
+
+def _run_thread_local_algorithm_case(cp, cc, case: _AlgorithmCase) -> None:
+    """Stress one case: each thread builds its own algorithm object, runs it, checks it."""
+    warm_algorithm = case.make_shared(cp, cc)  # single-threaded run before any threads
+    warm_worker = case.make_worker(cp, cc, worker_id=0, iteration=-1)
+    case.run(cp, cc, warm_algorithm, warm_worker)
+    case.check(cp, cc, warm_worker)
+
+    for iteration in range(STRESS_ITERATIONS):
+        worker_state = [
+            case.make_worker(cp, cc, worker_id=worker_id, iteration=iteration)
+            for worker_id in range(STRESS_THREADS)
+        ]
+        returned_algorithms = [None] * STRESS_THREADS
+
+        def make_thread(worker_id, worker):
+            def thread(barrier):
+                barrier.wait()  # line all threads up before they call make_shared
+                algorithm = case.make_shared(cp, cc)
+                returned_algorithms[worker_id] = algorithm
+                case.run(cp, cc, algorithm, worker)
+                case.check(cp, cc, worker)
+
+            return thread
+
+        _run_threaded(
+            [make_thread(worker_id, worker) for worker_id, worker in enumerate(worker_state)]
+        )
+        # Every thread holds a distinct algorithm object; all share one build result.
+        assert len({id(algorithm) for algorithm in returned_algorithms}) == len(
+            returned_algorithms
+        )
+        assert len(
+            {id(_get_build_result(algorithm)) for algorithm in returned_algorithms}
+        ) == 1
+
+
+def _make_reduce_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_in = np.arange(64, dtype=np.int32) + worker_id * 101 + iteration
+    h_init = np.array([7 + worker_id], dtype=np.int32)
+    with stream:
+        d_in = cp.asarray(h_in)
+        d_out = cp.empty(1, dtype=np.int32)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_in": h_in,
+        "d_in": d_in,
+        "d_out": d_out,
+        "h_init": h_init,
+    }
+
+
+def _make_reduce_shared(cp, cc):
+    worker = _make_reduce_worker(cp, cc, 0, -1)
+    return cc.make_reduce_into(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        h_init=worker["h_init"],
+    )
+
+
+def _run_reduce(cp, cc, reducer, worker):
+    _call_with_temp(
+        cp,
+        reducer,
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        h_init=worker["h_init"],
+        num_items=worker["h_in"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_reduce(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected = worker["h_in"].sum(dtype=np.int64) + int(worker["h_init"][0])
+    assert int(worker["d_out"].get()[0]) == int(expected)
+
+
+def _make_unary_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_in = np.arange(32, dtype=np.int32) + worker_id * 17 + iteration
+    with stream:
+        d_in = cp.asarray(h_in)
+        d_out = cp.empty_like(d_in)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_in": h_in,
+        "d_in": d_in,
+        "d_out": d_out,
+    }
+
+
+def _make_unary_shared(cp, cc):
+    worker = _make_unary_worker(cp, cc, 0, -1)
+    return cc.make_unary_transform(
+        d_in=worker["d_in"], d_out=worker["d_out"], op=cc.OpKind.NEGATE
+    )
+
+
+def _make_unary_for_worker(cp, cc, worker):
+    return cc.make_unary_transform(
+        d_in=worker["d_in"], d_out=worker["d_out"], op=cc.OpKind.NEGATE
+    )
+
+
+def _run_unary(cp, cc, transformer, worker):
+    transformer(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.NEGATE,
+        num_items=worker["h_in"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_unary(cp, cc, worker):
+    worker["stream"].synchronize()
+    np.testing.assert_array_equal(worker["d_out"].get(), -worker["h_in"])
+
+
+def _make_binary_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_in1 = np.arange(32, dtype=np.int32) + worker_id * 13
+    h_in2 = np.arange(32, dtype=np.int32) + iteration * 7
+    with stream:
+        d_in1 = cp.asarray(h_in1)
+        d_in2 = cp.asarray(h_in2)
+        d_out = cp.empty_like(d_in1)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_in1": h_in1,
+        "h_in2": h_in2,
+        "d_in1": d_in1,
+        "d_in2": d_in2,
+        "d_out": d_out,
+    }
+
+
+def _make_binary_shared(cp, cc):
+    worker = _make_binary_worker(cp, cc, 0, -1)
+    return cc.make_binary_transform(
+        d_in1=worker["d_in1"],
+        d_in2=worker["d_in2"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+    )
+
+
+def _make_binary_for_worker(cp, cc, worker):
+    return cc.make_binary_transform(
+        d_in1=worker["d_in1"],
+        d_in2=worker["d_in2"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+    )
+
+
+def _run_binary(cp, cc, transformer, worker):
+    transformer(
+        d_in1=worker["d_in1"],
+        d_in2=worker["d_in2"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        num_items=worker["h_in1"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_binary(cp, cc, worker):
+    worker["stream"].synchronize()
+    np.testing.assert_array_equal(worker["d_out"].get(), worker["h_in1"] + worker["h_in2"])
+
+
+def _make_scan_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_in = np.arange(1, 33, dtype=np.int32) + worker_id + iteration
+    h_init = np.array([3 + worker_id], dtype=np.int32)
+    with stream:
+        d_in = cp.asarray(h_in)
+        d_out = cp.empty_like(d_in)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_in": h_in,
+        "h_init": h_init,
+        "d_in": d_in,
+        "d_out": d_out,
+    }
+
+
+def _make_exclusive_scan_shared(cp, cc):
+    worker = _make_scan_worker(cp, cc, 0, -1)
+    return cc.make_exclusive_scan(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        init_value=worker["h_init"],
+    )
+
+
+def _make_inclusive_scan_shared(cp, cc):
+    worker = _make_scan_worker(cp, cc, 0, -1)
+    return cc.make_inclusive_scan(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        init_value=worker["h_init"],
+    )
+
+
+def _run_scan(cp, cc, scanner, worker):
+    _call_with_temp(
+        cp,
+        scanner,
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        init_value=worker["h_init"],
+        num_items=worker["h_in"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_exclusive_scan(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected = np.empty_like(worker["h_in"])
+    expected[0] = worker["h_init"][0]
+    expected[1:] = worker["h_init"][0] + np.cumsum(worker["h_in"][:-1])
+    np.testing.assert_array_equal(worker["d_out"].get(), expected)
+
+
+def _check_inclusive_scan(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected = worker["h_init"][0] + np.cumsum(worker["h_in"])
+    np.testing.assert_array_equal(worker["d_out"].get(), expected)
+
+
+def _make_segmented_reduce_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_in = np.arange(1, 17, dtype=np.int32) + worker_id * 3 + iteration
+    h_start_offsets = np.array([0, 3, 8, 12], dtype=np.int32)
+    h_end_offsets = np.array([3, 8, 12, 16], dtype=np.int32)
+    h_init = np.array([worker_id], dtype=np.int32)
+    with stream:
+        d_in = cp.asarray(h_in)
+        d_out = cp.empty(len(h_start_offsets), dtype=np.int32)
+        d_start_offsets = cp.asarray(h_start_offsets)
+        d_end_offsets = cp.asarray(h_end_offsets)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_in": h_in,
+        "h_start_offsets": h_start_offsets,
+        "h_end_offsets": h_end_offsets,
+        "h_init": h_init,
+        "d_in": d_in,
+        "d_out": d_out,
+        "d_start_offsets": d_start_offsets,
+        "d_end_offsets": d_end_offsets,
+    }
+
+
+def _make_segmented_reduce_shared(cp, cc):
+    worker = _make_segmented_reduce_worker(cp, cc, 0, -1)
+    return cc.make_segmented_reduce(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        start_offsets_in=worker["d_start_offsets"],
+        end_offsets_in=worker["d_end_offsets"],
+        op=cc.OpKind.PLUS,
+        h_init=worker["h_init"],
+    )
+
+
+def _run_segmented_reduce(cp, cc, reducer, worker):
+    _call_with_temp(
+        cp,
+        reducer,
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        num_segments=len(worker["h_start_offsets"]),
+        start_offsets_in=worker["d_start_offsets"],
+        end_offsets_in=worker["d_end_offsets"],
+        op=cc.OpKind.PLUS,
+        h_init=worker["h_init"],
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_segmented_reduce(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected = np.array(
+        [
+            worker["h_in"][start:end].sum() + worker["h_init"][0]
+            for start, end in zip(worker["h_start_offsets"], worker["h_end_offsets"])
+        ],
+        dtype=np.int32,
+    )
+    np.testing.assert_array_equal(worker["d_out"].get(), expected)
+
+
+def _make_histogram_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    lower = np.float32(worker_id * 10)
+    upper = np.float32(lower + 8)
+    h_samples = np.array(
+        [
+            lower + 0.5,
+            lower + 1.5,
+            lower + 2.0,
+            lower + 3.5,
+            lower + 6.0,
+            upper + 1.0,
+        ],
+        dtype=np.float32,
+    )
+    h_num_levels = np.array([5], dtype=np.int32)
+    h_lower = np.array([lower], dtype=np.float32)
+    h_upper = np.array([upper], dtype=np.float32)
+    with stream:
+        d_samples = cp.asarray(h_samples)
+        d_histogram = cp.zeros(h_num_levels[0] - 1, dtype=np.int32)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_samples": h_samples,
+        "h_num_levels": h_num_levels,
+        "h_lower": h_lower,
+        "h_upper": h_upper,
+        "d_samples": d_samples,
+        "d_histogram": d_histogram,
+    }
+
+
+def _make_histogram_shared(cp, cc):
+    worker = _make_histogram_worker(cp, cc, 0, -1)
+    return cc.make_histogram_even(
+        d_samples=worker["d_samples"],
+        d_histogram=worker["d_histogram"],
+        h_num_output_levels=worker["h_num_levels"],
+        h_lower_level=worker["h_lower"],
+        h_upper_level=worker["h_upper"],
+        num_samples=worker["h_samples"].size,
+    )
+
+
+def _run_histogram(cp, cc, histogrammer, worker):
+    with worker["stream"]:
+        worker["d_histogram"].fill(0)
+    _call_with_temp(
+        cp,
+        histogrammer,
+        d_samples=worker["d_samples"],
+        d_histogram=worker["d_histogram"],
+        h_num_output_levels=worker["h_num_levels"],
+        h_lower_level=worker["h_lower"],
+        h_upper_level=worker["h_upper"],
+        num_samples=worker["h_samples"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_histogram(cp, cc, worker):
+    worker["stream"].synchronize()  # finish this worker's stream before reading back
+    expected, _ = np.histogram(  # NOTE(review): numpy's last bin includes the upper edge
+        worker["h_samples"],
+        bins=int(worker["h_num_levels"][0] - 1),  # one fewer bin than levels
+        range=(float(worker["h_lower"][0]), float(worker["h_upper"][0])),
+    )
+    np.testing.assert_array_equal(worker["d_histogram"].get(), expected.astype(np.int32))
+
+
+def _make_binary_search_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_data = np.array([90, 70, 50, 30, 10], dtype=np.int32) - worker_id
+    h_values = np.array([95, 70, 45, 10, 5], dtype=np.int32) - worker_id
+    with stream:
+        d_data = cp.asarray(h_data)
+        d_values = cp.asarray(h_values)
+        d_out = cp.empty(h_values.size, dtype=np.uintp)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_data": h_data,
+        "h_values": h_values,
+        "d_data": d_data,
+        "d_values": d_values,
+        "d_out": d_out,
+    }
+
+
+def _make_lower_bound_shared(cp, cc):
+    worker = _make_binary_search_worker(cp, cc, 0, -1)
+    return cc.make_lower_bound(
+        d_data=worker["d_data"],
+        d_values=worker["d_values"],
+        d_out=worker["d_out"],
+        comp=cc.OpKind.GREATER,
+    )
+
+
+def _make_upper_bound_shared(cp, cc):
+    worker = _make_binary_search_worker(cp, cc, 0, -1)
+    return cc.make_upper_bound(
+        d_data=worker["d_data"],
+        d_values=worker["d_values"],
+        d_out=worker["d_out"],
+        comp=cc.OpKind.GREATER,
+    )
+
+
+def _run_binary_search(cp, cc, searcher, worker):
+    searcher(
+        d_data=worker["d_data"],
+        num_items=worker["h_data"].size,
+        d_values=worker["d_values"],
+        num_values=worker["h_values"].size,
+        d_out=worker["d_out"],
+        comp=cc.OpKind.GREATER,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_lower_bound(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected = np.searchsorted(-worker["h_data"], -worker["h_values"], side="left")
+    np.testing.assert_array_equal(worker["d_out"].get(), expected.astype(np.uintp))
+
+
+def _check_upper_bound(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected = np.searchsorted(-worker["h_data"], -worker["h_values"], side="right")
+    np.testing.assert_array_equal(worker["d_out"].get(), expected.astype(np.uintp))
+
+
+def _make_select_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_in = np.array(
+        [True, False, worker_id % 2 == 0, True, False, iteration % 2 == 0],
+        dtype=np.bool_,
+    )
+    with stream:
+        d_in = cp.asarray(h_in)
+        d_out = cp.empty_like(d_in)
+        d_count = cp.empty(2, dtype=np.uint64)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_in": h_in,
+        "d_in": d_in,
+        "d_out": d_out,
+        "d_count": d_count,
+    }
+
+
+def _make_select_shared(cp, cc):
+    worker = _make_select_worker(cp, cc, 0, -1)
+    return cc.make_select(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        d_num_selected_out=worker["d_count"],
+        cond=cc.OpKind.IDENTITY,
+    )
+
+
+def _run_select(cp, cc, selector, worker):
+    _call_with_temp(
+        cp,
+        selector,
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        d_num_selected_out=worker["d_count"],
+        cond=cc.OpKind.IDENTITY,
+        num_items=worker["h_in"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_select(cp, cc, worker):
+    worker["stream"].synchronize()
+    count = int(worker["d_count"].get()[0])
+    expected = worker["h_in"][worker["h_in"]]
+    assert count == expected.size
+    np.testing.assert_array_equal(worker["d_out"].get()[:count], expected)
+
+
+def _make_three_way_shared(cp, cc):
+    """Build the partitioner from a throwaway worker-0 buffer set allocated on its stream."""
+    worker = _make_three_way_worker(cp, cc, 0, -1)
+    return cc.make_three_way_partition(
+        d_in=worker["d_in"],
+        d_first_part_out=worker["d_out"],
+        d_second_part_out=worker["d_second_out"],
+        d_unselected_out=worker["d_unselected"],
+        d_num_selected_out=worker["d_count"],
+        select_first_part_op=cc.OpKind.IDENTITY,
+        select_second_part_op=cc.OpKind.LOGICAL_NOT,
+    )
+
+
+def _make_three_way_worker(cp, cc, worker_id, iteration):
+    worker = _make_select_worker(cp, cc, worker_id, iteration)
+    stream = worker["stream"]
+    with stream:
+        worker["d_second_out"] = cp.empty_like(worker["d_in"])
+        worker["d_unselected"] = cp.empty_like(worker["d_in"])
+    return worker
+
+
+def _run_three_way(cp, cc, partitioner, worker):
+    _call_with_temp(
+        cp,
+        partitioner,
+        d_in=worker["d_in"],
+        d_first_part_out=worker["d_out"],
+        d_second_part_out=worker["d_second_out"],
+        d_unselected_out=worker["d_unselected"],
+        d_num_selected_out=worker["d_count"],
+        select_first_part_op=cc.OpKind.IDENTITY,
+        select_second_part_op=cc.OpKind.LOGICAL_NOT,
+        num_items=worker["h_in"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_three_way(cp, cc, worker):
+    worker["stream"].synchronize()  # finish this worker's stream before reading back
+    counts = worker["d_count"].get()  # [0] and [1] are compared with the true/false counts
+    true_count = int(np.count_nonzero(worker["h_in"]))
+    false_count = int(worker["h_in"].size - true_count)
+    assert int(counts[0]) == true_count
+    assert int(counts[1]) == false_count
+    np.testing.assert_array_equal(
+        worker["d_out"].get()[:true_count], np.ones(true_count, dtype=np.bool_)
+    )
+    np.testing.assert_array_equal(  # worker["d_unselected"] is not inspected here
+        worker["d_second_out"].get()[:false_count], np.zeros(false_count, dtype=np.bool_)
+    )
+
+
+def _make_unique_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    base = worker_id * 10 + iteration
+    h_keys = np.array([base, base, base + 1, base + 2, base + 2, base + 3], dtype=np.int32)
+    h_items = np.arange(h_keys.size, dtype=np.int32) + worker_id * 100
+    with stream:
+        d_in_keys = cp.asarray(h_keys)
+        d_in_items = cp.asarray(h_items)
+        d_out_keys = cp.empty_like(d_in_keys)
+        d_out_items = cp.empty_like(d_in_items)
+        d_count = cp.empty(1, dtype=np.int32)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_keys": h_keys,
+        "h_items": h_items,
+        "d_in_keys": d_in_keys,
+        "d_in_items": d_in_items,
+        "d_out_keys": d_out_keys,
+        "d_out_items": d_out_items,
+        "d_count": d_count,
+    }
+
+
+def _make_unique_shared(cp, cc):
+    worker = _make_unique_worker(cp, cc, 0, -1)
+    return cc.make_unique_by_key(
+        d_in_keys=worker["d_in_keys"],
+        d_in_items=worker["d_in_items"],
+        d_out_keys=worker["d_out_keys"],
+        d_out_items=worker["d_out_items"],
+        d_out_num_selected=worker["d_count"],
+        op=cc.OpKind.EQUAL_TO,
+    )
+
+
+def _run_unique(cp, cc, uniquer, worker):
+    _call_with_temp(
+        cp,
+        uniquer,
+        d_in_keys=worker["d_in_keys"],
+        d_in_items=worker["d_in_items"],
+        d_out_keys=worker["d_out_keys"],
+        d_out_items=worker["d_out_items"],
+        d_out_num_selected=worker["d_count"],
+        op=cc.OpKind.EQUAL_TO,
+        num_items=worker["h_keys"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_unique(cp, cc, worker):
+    worker["stream"].synchronize()
+    selected = np.concatenate(([True], worker["h_keys"][1:] != worker["h_keys"][:-1]))
+    expected_keys = worker["h_keys"][selected]
+    expected_items = worker["h_items"][selected]
+    count = int(worker["d_count"].get()[0])
+    assert count == expected_keys.size
+    np.testing.assert_array_equal(worker["d_out_keys"].get()[:count], expected_keys)
+    np.testing.assert_array_equal(worker["d_out_items"].get()[:count], expected_items)
+
+
+def _make_merge_sort_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_keys = np.array([5, 1, 3, 1, 4, 2], dtype=np.int32) + worker_id * 10
+    h_values = np.arange(h_keys.size, dtype=np.int32) + iteration * 100
+    with stream:
+        d_in_keys = cp.asarray(h_keys)
+        d_in_values = cp.asarray(h_values)
+        d_out_keys = cp.empty_like(d_in_keys)
+        d_out_values = cp.empty_like(d_in_values)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_keys": h_keys,
+        "h_values": h_values,
+        "d_in_keys": d_in_keys,
+        "d_in_values": d_in_values,
+        "d_out_keys": d_out_keys,
+        "d_out_values": d_out_values,
+    }
+
+
+def _make_merge_sort_shared(cp, cc):
+    worker = _make_merge_sort_worker(cp, cc, 0, -1)
+    return cc.make_merge_sort(
+        d_in_keys=worker["d_in_keys"],
+        d_in_values=worker["d_in_values"],
+        d_out_keys=worker["d_out_keys"],
+        d_out_values=worker["d_out_values"],
+        op=cc.OpKind.LESS,
+    )
+
+
+def _run_merge_sort(cp, cc, sorter, worker):
+    _call_with_temp(
+        cp,
+        sorter,
+        d_in_keys=worker["d_in_keys"],
+        d_in_values=worker["d_in_values"],
+        d_out_keys=worker["d_out_keys"],
+        d_out_values=worker["d_out_values"],
+        op=cc.OpKind.LESS,
+        num_items=worker["h_keys"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_merge_sort(cp, cc, worker):
+    worker["stream"].synchronize()
+    order = np.argsort(worker["h_keys"], kind="stable")
+    np.testing.assert_array_equal(worker["d_out_keys"].get(), worker["h_keys"][order])
+    np.testing.assert_array_equal(worker["d_out_values"].get(), worker["h_values"][order])
+
+
+def _make_radix_sort_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_keys = np.array([7, 3, 5, 3, 1, 9], dtype=np.uint32) + np.uint32(worker_id * 11)
+    h_values = np.arange(h_keys.size, dtype=np.int32) + iteration * 10
+    with stream:
+        d_in_keys = cp.asarray(h_keys)
+        d_tmp_keys = cp.empty_like(d_in_keys)
+        d_in_values = cp.asarray(h_values)
+        d_tmp_values = cp.empty_like(d_in_values)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_keys": h_keys,
+        "h_values": h_values,
+        "keys": cc.DoubleBuffer(d_in_keys, d_tmp_keys),
+        "values": cc.DoubleBuffer(d_in_values, d_tmp_values),
+    }
+
+
+def _make_radix_sort_shared(cp, cc):
+    worker = _make_radix_sort_worker(cp, cc, 0, -1)
+    return cc.make_radix_sort(
+        d_in_keys=worker["keys"],
+        d_out_keys=None,
+        d_in_values=worker["values"],
+        d_out_values=None,
+        order=cc.SortOrder.ASCENDING,
+    )
+
+
+def _run_radix_sort(cp, cc, sorter, worker):
+    _call_with_temp(
+        cp,
+        sorter,
+        d_in_keys=worker["keys"],
+        d_out_keys=None,
+        d_in_values=worker["values"],
+        d_out_values=None,
+        num_items=worker["h_keys"].size,
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_radix_sort(cp, cc, worker):
+    worker["stream"].synchronize()
+    order = np.argsort(worker["h_keys"], kind="stable")
+    np.testing.assert_array_equal(worker["keys"].current().get(), worker["h_keys"][order])
+    np.testing.assert_array_equal(worker["values"].current().get(), worker["h_values"][order])
+    assert worker["keys"].selector == worker["values"].selector
+
+
+def _make_segmented_sort_worker(cp, cc, worker_id, iteration):
+    stream, cuda_stream = _make_stream(cp)
+    h_keys = np.array([4, 2, 3, 8, 6, 7, 1, 5], dtype=np.int32) + worker_id * 13
+    h_values = np.arange(h_keys.size, dtype=np.int32) + iteration * 100
+    h_start_offsets = np.array([0, 3, 6], dtype=np.int32)
+    h_end_offsets = np.array([3, 6, 8], dtype=np.int32)
+    with stream:
+        d_in_keys = cp.asarray(h_keys)
+        d_tmp_keys = cp.empty_like(d_in_keys)
+        d_in_values = cp.asarray(h_values)
+        d_tmp_values = cp.empty_like(d_in_values)
+        d_start_offsets = cp.asarray(h_start_offsets)
+        d_end_offsets = cp.asarray(h_end_offsets)
+    return {
+        "stream": stream,
+        "cuda_stream": cuda_stream,
+        "h_keys": h_keys,
+        "h_values": h_values,
+        "h_start_offsets": h_start_offsets,
+        "h_end_offsets": h_end_offsets,
+        "keys": cc.DoubleBuffer(d_in_keys, d_tmp_keys),
+        "values": cc.DoubleBuffer(d_in_values, d_tmp_values),
+        "d_start_offsets": d_start_offsets,
+        "d_end_offsets": d_end_offsets,
+    }
+
+
+def _make_segmented_sort_shared(cp, cc):
+    worker = _make_segmented_sort_worker(cp, cc, 0, -1)
+    return cc.make_segmented_sort(
+        d_in_keys=worker["keys"],
+        d_out_keys=None,
+        d_in_values=worker["values"],
+        d_out_values=None,
+        start_offsets_in=worker["d_start_offsets"],
+        end_offsets_in=worker["d_end_offsets"],
+        order=cc.SortOrder.ASCENDING,
+    )
+
+
+def _run_segmented_sort(cp, cc, sorter, worker):
+    _call_with_temp(
+        cp,
+        sorter,
+        d_in_keys=worker["keys"],
+        d_out_keys=None,
+        d_in_values=worker["values"],
+        d_out_values=None,
+        num_items=worker["h_keys"].size,
+        num_segments=worker["h_start_offsets"].size,
+        start_offsets_in=worker["d_start_offsets"],
+        end_offsets_in=worker["d_end_offsets"],
+        stream=worker["cuda_stream"],
+    )
+
+
+def _check_segmented_sort(cp, cc, worker):
+    worker["stream"].synchronize()
+    expected_keys, expected_values = _selected_segments(
+        worker["h_keys"],
+        worker["h_values"],
+        worker["h_start_offsets"],
+        worker["h_end_offsets"],
+    )
+    np.testing.assert_array_equal(worker["keys"].current().get(), expected_keys)
+    np.testing.assert_array_equal(worker["values"].current().get(), expected_values)
+    assert worker["keys"].selector == worker["values"].selector
+
+
+SHARED_ALGORITHM_CASES = [
+    _AlgorithmCase("reduce", _make_reduce_shared, _make_reduce_worker, _run_reduce, _check_reduce),
+    _AlgorithmCase(
+        "unary_transform", _make_unary_shared, _make_unary_worker, _run_unary, _check_unary
+    ),
+    _AlgorithmCase(
+        "binary_transform",
+        _make_binary_shared,
+        _make_binary_worker,
+        _run_binary,
+        _check_binary,
+    ),
+    _AlgorithmCase(
+        "exclusive_scan",
+        _make_exclusive_scan_shared,
+        _make_scan_worker,
+        _run_scan,
+        _check_exclusive_scan,
+    ),
+    _AlgorithmCase(
+        "inclusive_scan",
+        _make_inclusive_scan_shared,
+        _make_scan_worker,
+        _run_scan,
+        _check_inclusive_scan,
+    ),
+    _AlgorithmCase(
+        "segmented_reduce",
+        _make_segmented_reduce_shared,
+        _make_segmented_reduce_worker,
+        _run_segmented_reduce,
+        _check_segmented_reduce,
+    ),
+    _AlgorithmCase(
+        "histogram",
+        _make_histogram_shared,
+        _make_histogram_worker,
+        _run_histogram,
+        _check_histogram,
+    ),
+    _AlgorithmCase(
+        "lower_bound",
+        _make_lower_bound_shared,
+        _make_binary_search_worker,
+        _run_binary_search,
+        _check_lower_bound,
+    ),
+    _AlgorithmCase(
+        "upper_bound",
+        _make_upper_bound_shared,
+        _make_binary_search_worker,
+        _run_binary_search,
+        _check_upper_bound,
+    ),
+    _AlgorithmCase("select", _make_select_shared, _make_select_worker, _run_select, _check_select),
+    _AlgorithmCase(
+        "three_way_partition",
+        _make_three_way_shared,
+        _make_three_way_worker,
+        _run_three_way,
+        _check_three_way,
+    ),
+    _AlgorithmCase(
+        "unique_by_key", _make_unique_shared, _make_unique_worker, _run_unique, _check_unique
+    ),
+    _AlgorithmCase(
+        "merge_sort",
+        _make_merge_sort_shared,
+        _make_merge_sort_worker,
+        _run_merge_sort,
+        _check_merge_sort,
+    ),
+    _AlgorithmCase(
+        "radix_sort",
+        _make_radix_sort_shared,
+        _make_radix_sort_worker,
+        _run_radix_sort,
+        _check_radix_sort,
+    ),
+    _AlgorithmCase(
+        "segmented_sort",
+        _make_segmented_sort_shared,
+        _make_segmented_sort_worker,
+        _run_segmented_sort,
+        _check_segmented_sort,
+    ),
+]
+
+
+def test_free_threaded_import_keeps_gil_disabled(compute_modules):
+    cp, cc = compute_modules
+
+    h_in = np.arange(8, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    d_out = cp.empty(1, dtype=np.int32)
+    h_init = np.array([0], dtype=np.int32)
+
+    cc.reduce_into(
+        d_in=d_in,
+        d_out=d_out,
+        num_items=h_in.size,
+        op=cc.OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    assert int(d_out.get()[0]) == int(h_in.sum())
+    _assert_gil_disabled("after running cuda.compute smoke operation")
+
+
+@pytest.mark.parametrize("case", SHARED_ALGORITHM_CASES, ids=str)
+def test_thread_local_algorithm_objects_share_build_result(compute_modules, case):
+    cp, cc = compute_modules
+
+    _run_thread_local_algorithm_case(cp, cc, case)
+
+
+def _cache_miss_reduce(cp, cc, worker_id, iteration):
+    worker = _make_reduce_worker(cp, cc, worker_id, iteration)
+    reducer = cc.make_reduce_into(
+        d_in=worker["d_in"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+        h_init=worker["h_init"],
+    )
+    _run_reduce(cp, cc, reducer, worker)
+    _check_reduce(cp, cc, worker)
+    return reducer
+
+
+def _cache_miss_unary_transform(cp, cc, worker_id, iteration):
+    worker = _make_unary_worker(cp, cc, worker_id, iteration)
+    transformer = cc.make_unary_transform(
+        d_in=worker["d_in"], d_out=worker["d_out"], op=cc.OpKind.NEGATE
+    )
+    _run_unary(cp, cc, transformer, worker)
+    _check_unary(cp, cc, worker)
+    return transformer
+
+
+def _cache_miss_binary_transform(cp, cc, worker_id, iteration):
+    worker = _make_binary_worker(cp, cc, worker_id, iteration)
+    transformer = cc.make_binary_transform(
+        d_in1=worker["d_in1"],
+        d_in2=worker["d_in2"],
+        d_out=worker["d_out"],
+        op=cc.OpKind.PLUS,
+    )
+    _run_binary(cp, cc, transformer, worker)
+    _check_binary(cp, cc, worker)
+    return transformer
+
+
+@pytest.mark.parametrize(
+    "factory",
+    [_cache_miss_reduce, _cache_miss_unary_transform, _cache_miss_binary_transform],
+    ids=["reduce", "unary_transform", "binary_transform"],
+)
+def test_same_key_factory_cache_miss_storm(compute_modules, factory):
+    cp, cc = compute_modules
+
+    for iteration in range(STRESS_ITERATIONS):
+        cc.clear_all_caches()
+        returned_objects = [None] * STRESS_THREADS
+
+        def make_thread(worker_id):
+            def thread(barrier):
+                barrier.wait()
+                returned_objects[worker_id] = factory(cp, cc, worker_id, iteration)
+
+            return thread
+
+        _run_threaded([make_thread(worker_id) for worker_id in range(STRESS_THREADS)])
+
+        assert len({id(obj) for obj in returned_objects}) == len(returned_objects)
+        assert len({id(_get_build_result(obj)) for obj in returned_objects}) == 1
+
+
+def test_shared_raw_op_object_direct_algorithm_stress(compute_modules):
+    """Build and run reducers concurrently from one RawOp instance shared by all threads."""
+    cp, cc = compute_modules
+
+    from cuda.compute._cpp_compile import compile_cpp_op_code
+    from cuda.compute.op import RawOp
+
+    source = """
+    extern "C" __device__ void raw_add_i32(void* a, void* b, void* result) {
+        *static_cast<int*>(result) = *static_cast<int*>(a) + *static_cast<int*>(b);
+    }
+    """
+    # One op object is created up front and handed to every worker thread.
+    shared_op = RawOp(ltoir=compile_cpp_op_code(source), name="raw_add_i32")
+
+    for iteration in range(STRESS_ITERATIONS):
+        cc.clear_all_caches()
+        reducers = [None] * STRESS_THREADS
+
+        def make_thread(worker_id):
+            # Streams and inputs are set up here, by the thread that launches the
+            # workers; building, running and checking happen inside `thread`.
+            stream, cuda_stream = _make_stream(cp)
+            h_in = np.arange(32, dtype=np.int32) + worker_id * 31 + iteration
+            h_init = np.array([worker_id + 5], dtype=np.int32)
+            with stream:
+                d_in = cp.asarray(h_in)
+                d_out = cp.empty(1, dtype=np.int32)
+
+            def thread(barrier):
+                barrier.wait()
+                reduce_args = dict(d_in=d_in, d_out=d_out, op=shared_op, h_init=h_init)
+                reducer = cc.make_reduce_into(**reduce_args)
+                reducers[worker_id] = reducer
+                _call_with_temp(
+                    cp,
+                    reducer,
+                    **reduce_args,
+                    num_items=h_in.size,
+                    stream=cuda_stream,
+                )
+                stream.synchronize()
+                # Sum on the host in int64, then fold in the initial value.
+                total = h_in.sum(dtype=np.int64) + h_init[0]
+                assert int(d_out.get()[0]) == int(total)
+
+            return thread
+
+        _run_threaded([make_thread(worker_id) for worker_id in range(STRESS_THREADS)])
+
+        # Every thread must end up with its own reducer wrapper, while all of the
+        # wrappers reference one and the same build result.
+        reducer_ids = {id(reducer) for reducer in reducers}
+        assert len(reducer_ids) == len(reducers)
+        build_ids = {id(_get_build_result(reducer)) for reducer in reducers}
+        assert len(build_ids) == 1
+
+
+@dataclass(frozen=True)
+class _IteratorCase:
+    name: str
+    make_iterator: Callable
+    dtype: np.dtype
+    num_items: int
+    expected_sum: int
+
+    def __str__(self):
+        return self.name
+
+
+@dataclass(frozen=True)
+class _ColdTransformCase:
+    name: str
+    make_worker: Callable
+    make_transformer: Callable
+    run: Callable
+    check: Callable
+
+    def __str__(self):
+        return self.name
+
+
+def _run_cold_transform_native_cache_case(cp, cc, case: _ColdTransformCase) -> None:
+    """Race first-time transform executions from a cold cache, once per stress iteration."""
+    thread_count = TRANSFORM_NATIVE_CACHE_THREADS
+    for iteration in range(STRESS_ITERATIONS):
+        cc.clear_all_caches()
+        workers = [
+            case.make_worker(cp, cc, worker_id=slot, iteration=iteration)
+            for slot in range(thread_count)
+        ]
+        built = [None] * thread_count
+        # Transform fills its native launch-config cache on the first execution, so
+        # all wrappers are built first and the first calls are released together.
+        run_gate = threading.Barrier(thread_count)
+
+        def make_thread(slot, worker):
+            def thread(barrier):
+                barrier.wait()
+                try:
+                    transformer = case.make_transformer(cp, cc, worker)
+                    built[slot] = transformer
+                except BaseException:
+                    # Break the gate so peers fail fast instead of waiting for the timeout.
+                    run_gate.abort()
+                    raise
+
+                run_gate.wait(timeout=60)
+                case.run(cp, cc, transformer, worker)
+                case.check(cp, cc, worker)
+
+            return thread
+
+        _run_threaded([make_thread(slot, worker) for slot, worker in enumerate(workers)])
+
+        # Wrappers must be distinct per thread while sharing a single build result.
+        wrapper_ids = {id(transformer) for transformer in built}
+        assert len(wrapper_ids) == len(built)
+        build_ids = {id(_get_build_result(transformer)) for transformer in built}
+        assert len(build_ids) == 1
+
+
+@pytest.mark.parametrize(
+    "case",
+    [
+        _ColdTransformCase(
+            "unary_transform",
+            _make_unary_worker,
+            _make_unary_for_worker,
+            _run_unary,
+            _check_unary,
+        ),
+        _ColdTransformCase(
+            "binary_transform",
+            _make_binary_worker,
+            _make_binary_for_worker,
+            _run_binary,
+            _check_binary,
+        ),
+    ],
+    ids=str,
+)
+def test_cold_transform_native_cache_initialization_stress(compute_modules, case):
+    cp, cc = compute_modules
+
+    _run_cold_transform_native_cache_case(cp, cc, case)
+
+
+def _iterator_counting(cp, cc):
+    return cc.CountingIterator(np.int32(0)), np.dtype(np.int32), 32, sum(range(32))
+
+
+def _iterator_constant(cp, cc):
+    return cc.ConstantIterator(np.int32(5)), np.dtype(np.int32), 32, 32 * 5
+
+
+def _iterator_cache_modified(cp, cc):
+    h_in = np.arange(32, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    return cc.CacheModifiedInputIterator(d_in, "stream"), h_in.dtype, h_in.size, int(h_in.sum())
+
+
+def _iterator_reverse(cp, cc):
+    h_in = np.arange(32, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    return cc.ReverseIterator(d_in), h_in.dtype, h_in.size, int(h_in.sum())
+
+
+def _iterator_permutation(cp, cc):
+    h_values = np.arange(32, dtype=np.int32)
+    h_indices = np.arange(31, -1, -1, dtype=np.int32)
+    d_values = cp.asarray(h_values)
+    d_indices = cp.asarray(h_indices)
+    return (
+        cc.PermutationIterator(d_values, d_indices),
+        h_values.dtype,
+        h_indices.size,
+        int(h_values[h_indices].sum()),
+    )
+
+
+def _iterator_shuffle(cp, cc):
+    num_items = 32
+    return (
+        cc.ShuffleIterator(num_items, seed=1234),
+        np.dtype(np.int64),
+        num_items,
+        sum(range(num_items)),
+    )
+
+
+def _iterator_transform(cp, cc):
+    from cuda.compute import types
+    from cuda.compute._cpp_compile import compile_cpp_op_code
+    from cuda.compute.op import RawOp
+
+    num_items = 32
+    source = """
+    extern "C" __device__ void negate_i32(void* input, void* result) {
+        *static_cast<int*>(result) = -*static_cast<int*>(input);
+    }
+    """
+    op = RawOp(ltoir=compile_cpp_op_code(source), name="negate_i32")
+    return (
+        cc.TransformIterator(cc.CountingIterator(np.int32(0)), op, value_type=types.int32),
+        np.dtype(np.int32),
+        num_items,
+        -sum(range(num_items)),
+    )
+
+
+ITERATOR_FACTORIES = [
+    _iterator_counting,
+    _iterator_constant,
+    _iterator_cache_modified,
+    _iterator_reverse,
+    _iterator_permutation,
+    _iterator_shuffle,
+    _iterator_transform,
+]
+
+
+@pytest.mark.parametrize(
+    "make_iterator",
+    ITERATOR_FACTORIES,
+    # Test ids are the factory names without the "_iterator_" prefix.
+    ids=lambda fn: fn.__name__.removeprefix("_iterator_"),
+)
+def test_shared_iterator_object_stress(compute_modules, make_iterator):
+    """Reduce over one iterator object that every worker thread shares."""
+    cp, cc = compute_modules
+
+    shared_iterator, dtype, num_items, expected_sum = make_iterator(cp, cc)
+    # Device-wide sync: some factories upload arrays before the worker streams exist.
+    cp.cuda.Device().synchronize()
+
+    for _ in range(STRESS_ITERATIONS):
+        cc.clear_all_caches()
+
+        def make_thread(worker_id):
+            stream, cuda_stream = _make_stream(cp)
+            # A per-worker initial value makes each thread's expected result unique.
+            h_init = np.array([worker_id], dtype=dtype)
+            with stream:
+                d_out = cp.empty(1, dtype=dtype)
+
+            def thread(barrier):
+                barrier.wait()
+                reduce_args = dict(
+                    d_in=shared_iterator, d_out=d_out, op=cc.OpKind.PLUS, h_init=h_init
+                )
+                reducer = cc.make_reduce_into(**reduce_args)
+                _call_with_temp(
+                    cp,
+                    reducer,
+                    **reduce_args,
+                    num_items=num_items,
+                    stream=cuda_stream,
+                )
+                stream.synchronize()
+                # The reduction folds h_init into the iterator's known sum.
+                assert int(d_out.get()[0]) == int(expected_sum + h_init[0])
+
+            return thread
+
+        _run_threaded([make_thread(worker_id) for worker_id in range(STRESS_THREADS)])
+
+
+def test_runtime_ownership_isolation(compute_modules):
+    cp, cc = compute_modules
+
+    def make_thread(worker_id):
+        def thread(barrier):
+            barrier.wait()
+            stream, cuda_stream = _make_stream(cp)
+            h_in = np.arange(16, dtype=np.int32) + worker_id * 10
+            h_init = np.array([worker_id], dtype=np.int32)
+
+            with stream:
+                d_in = cp.asarray(h_in)
+                d_reduce_out = cp.empty(1, dtype=np.int32)
+                d_scan_out = cp.empty_like(d_in)
+                d_transform_out = cp.empty_like(d_in)
+                d_hist = cp.zeros(4, dtype=np.int32)
+                h_keys = np.array([3, 1, 2, 1], dtype=np.uint32) + worker_id
+                d_keys_in = cp.asarray(h_keys)
+                d_keys_tmp = cp.empty_like(d_keys_in)
+
+            cc.reduce_into(
+                d_in=d_in,
+                d_out=d_reduce_out,
+                num_items=h_in.size,
+                op=cc.OpKind.PLUS,
+                h_init=h_init,
+                stream=cuda_stream,
+            )
+            cc.exclusive_scan(
+                d_in=d_in,
+                d_out=d_scan_out,
+                op=cc.OpKind.PLUS,
+                init_value=h_init,
+                num_items=h_in.size,
+                stream=cuda_stream,
+            )
+            cc.unary_transform(
+                d_in=d_in,
+                d_out=d_transform_out,
+                op=cc.OpKind.NEGATE,
+                num_items=h_in.size,
+                stream=cuda_stream,
+            )
+            cc.histogram_even(
+                d_samples=d_in,
+                d_histogram=d_hist,
+                num_output_levels=5,
+                lower_level=np.int32(worker_id * 10),
+                upper_level=np.int32(worker_id * 10 + 16),
+                num_samples=h_in.size,
+                stream=cuda_stream,
+            )
+            keys = cc.DoubleBuffer(d_keys_in, d_keys_tmp)
+            cc.radix_sort(
+                d_in_keys=keys,
+                d_out_keys=None,
+                d_in_values=None,
+                d_out_values=None,
+                num_items=d_keys_in.size,
+                order=cc.SortOrder.ASCENDING,
+                stream=cuda_stream,
+            )
+
+            stream.synchronize()
+            assert int(d_reduce_out.get()[0]) == int(h_in.sum() + worker_id)
+            expected_scan = np.empty_like(h_in)
+            expected_scan[0] = worker_id
+            expected_scan[1:] = worker_id + np.cumsum(h_in[:-1])
+            np.testing.assert_array_equal(d_scan_out.get(), expected_scan)
+            np.testing.assert_array_equal(d_transform_out.get(), -h_in)
+            assert int(d_hist.sum().get()) == h_in.size
+            np.testing.assert_array_equal(keys.current().get(), np.sort(h_keys))
+
+        return thread
+
+    for _ in range(STRESS_ITERATIONS):
+        _run_threaded([make_thread(worker_id) for worker_id in range(STRESS_THREADS)])
+
+
+def test_cache_clear_while_active_operations_is_not_a_supported_contract():
+    pytest.skip(
+        "clear_all_caches() while cached operations are active is an unsupported "
+        "contract decision; see ST-19 in stress_tests.md."
+    )

From e461b597054a0695db414cdb7fa730ab2a128a62 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 12:21:29 -0500
Subject: [PATCH 02/16] Decouple tests further from numba.cuda in favor of
 cupy and cuda.core, and add testing of the minimal install extra

---
 ci/matrix.yaml                                |  2 +
 ci/test_cuda_compute_minimal_python.sh        | 51 +++++++++++++
 .../tests/compute/test_binary_search.py       | 35 +++++++++
 .../cuda_cccl/tests/compute/test_bindings.py  |  2 +
 .../compute/test_deferred_annotations.py      |  2 +
 .../tests/compute/test_func_caching.py        |  8 ++
 .../cuda_cccl/tests/compute/test_histogram.py |  2 +
 .../cuda_cccl/tests/compute/test_iterators.py | 10 ++-
 .../tests/compute/test_merge_sort.py          | 55 +++++++-------
 .../tests/compute/test_nested_struct.py       |  5 ++
 .../cuda_cccl/tests/compute/test_no_numba.py  | 75 +-----------------
 .../compute/test_permutation_iterator.py      |  6 ++
 .../tests/compute/test_radix_sort.py          | 76 ++++++++++---------
 python/cuda_cccl/tests/compute/test_reduce.py | 38 +++++-----
 python/cuda_cccl/tests/compute/test_scan.py   | 16 ++--
 .../tests/compute/test_segmented_reduce.py    |  3 +
 .../tests/compute/test_segmented_sort.py      | 51 +++++++------
 python/cuda_cccl/tests/compute/test_select.py | 31 ++++++++
 .../tests/compute/test_shuffle_iterator.py    |  1 +
 .../compute/test_struct_field_validation.py   |  2 +
 .../tests/compute/test_three_way_partition.py | 46 +++++++++++
 .../cuda_cccl/tests/compute/test_transform.py |  5 ++
 .../tests/compute/test_unique_by_key.py       | 69 ++++++++---------
 .../test_void_ptr_wrapper_validation.py       |  4 +-
 .../tests/compute/test_zip_iterator.py        | 11 ++-
 25 files changed, 379 insertions(+), 227 deletions(-)
 create mode 100755 ci/test_cuda_compute_minimal_python.sh

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 6146d863c87..1fcb1df75ae 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -84,6 +84,7 @@ workflows:
     - {jobs: ['test'], project: 'python', ctk: ['12.X',        '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
     - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
     - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'}
+    - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'}
     # CCCL packaging:
     - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'}
     - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
@@ -539,6 +540,7 @@ jobs:
   test_py_headers:  { name: "Test cuda.cccl.headers",      gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
   test_py_coop:     { name: "Test cuda.coop._experimental",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
   test_py_par:      { name: "Test cuda.compute",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
+  test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} }
   test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
 
   # Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
new file mode 100755
index 00000000000..c48b313ed31
--- /dev/null
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(cd "$ci_dir/.." && pwd)"
+source "$ci_dir/pyenv_helper.sh"
+
+# Parse common arguments
+source "$ci_dir/util/python/common_arg_parser.sh"
+parse_python_args "$@"
+require_py_version "Usage: $0 -py-version <python_version>"
+
+cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)
+
+# Setup Python environment
+setup_python_env "${py_version}"
+
+# Fetch or build the cuda_cccl wheel:
+if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
+  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/
+  wheelhouse_dir="/home/coder/cccl/wheelhouse"
+else
+  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
+  wheelhouse_dir="${repo_root}/wheelhouse"
+fi
+
+# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
+# full cu* extras because those pull in numba/numba-cuda. In a clean minimal
+# environment, the test phase below runs only tests marked no_numba.
+CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
+python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
+python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
+
+# Default to the no_numba subset; widen to the full suite when numba.cuda imports.
+pytest_markers="not large and no_numba"
+if python - <<'PY'
+try:
+    import numba.cuda  # noqa: F401
+except Exception as exc:
+    print(f"numba.cuda unavailable; running no_numba subset: {exc!r}")
+    raise SystemExit(1)
+else:
+    print("numba.cuda available; running full compute test suite.")
+PY
+then
+  pytest_markers="not large"
+fi
+cd "${repo_root}/python/cuda_cccl/tests/"
+python -m pytest -n 6 -v compute/ -m "${pytest_markers}"
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index a4f7d047a9b..94863c3fe74 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -6,6 +6,8 @@
 import pytest
 
 import cuda.compute
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
 
 DTYPE_LIST = [
     np.int32,
@@ -39,6 +41,39 @@ def disable_sass_check(monkeypatch):
     )
 
 
+def _raw_less_i32_op() -> RawOp:
+    """Compile the int32 less-than comparator below and wrap it as a RawOp."""
+    source = """
+extern "C" __device__ void less_i32(void* lhs, void* rhs, void* result) {
+    int lhs_value = *static_cast<int*>(lhs);
+    int rhs_value = *static_cast<int*>(rhs);
+    *static_cast<bool*>(result) = lhs_value < rhs_value;
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="less_i32")
+
+
+@pytest.mark.no_numba
+def test_lower_bound_raw_op_minimal():
+    """lower_bound with a RawOp comparator matches np.searchsorted(side="left")."""
+    haystack = np.array([1, 3, 3, 7, 9], dtype=np.int32)
+    needles = np.array([0, 3, 4, 10], dtype=np.int32)
+    d_data = cp.asarray(haystack)
+    d_values = cp.asarray(needles)
+    d_out = cp.empty(len(needles), dtype=np.uintp)
+
+    cuda.compute.lower_bound(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_out,
+        comp=_raw_less_i32_op(),
+    )
+    expected = np.searchsorted(haystack, needles, side="left").astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
 @pytest.mark.parametrize("dtype", DTYPE_LIST)
 @pytest.mark.parametrize(
     "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)]
diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py
index 2f16da620ef..ccfbd8d0bc3 100644
--- a/python/cuda_cccl/tests/compute/test_bindings.py
+++ b/python/cuda_cccl/tests/compute/test_bindings.py
@@ -4,6 +4,8 @@
 
 import cuda.compute._bindings as bindings
 
+pytestmark = pytest.mark.no_numba
+
 
 @pytest.fixture(
     params=[
diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
index c30aeda4068..a584b8ffbe2 100644
--- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py
+++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
@@ -7,10 +7,12 @@
 
 import cupy as cp
 import numpy as np
+import pytest
 
 from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into
 
 
+@pytest.mark.no_numba
 def test_deferred_annotations():
     # test that we can use @gpu_struct with deferred annotations
     # GH: #6421
diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py
index 0da32901a32..5d12458790d 100644
--- a/python/cuda_cccl/tests/compute/test_func_caching.py
+++ b/python/cuda_cccl/tests/compute/test_func_caching.py
@@ -1,10 +1,12 @@
 import numpy as np
+import pytest
 
 from cuda.compute._caching import CachableFunction
 
 global_x = 1
 
 
+@pytest.mark.no_numba
 def test_func_caching_basic():
     def func(x):
         return x
@@ -19,6 +21,7 @@ def func(x):
     assert f1 == f2
 
 
+@pytest.mark.no_numba
 def test_func_caching_different_names():
     def func(x):
         return x
@@ -33,6 +36,7 @@ def func2(x):
     assert f1 != f2
 
 
+@pytest.mark.no_numba
 def test_func_caching_different_code():
     def func(x):
         return x
@@ -46,6 +50,7 @@ def func(x):
     assert f1 != f2
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_closure():
     def factory(x):
         def func(y):
@@ -61,6 +66,7 @@ def func(y):
     assert f1 != f3
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_global_variable():
     global global_x
 
@@ -107,6 +113,7 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func3)
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_global_np_ufunc():
     def make_func():
         def func(x):
@@ -126,6 +133,7 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func2)
 
 
+@pytest.mark.no_numba
 def test_func_caching_with_aliased_np_ufunc():
     def make_func1():
         amin = np.argmin
diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py
index 0a9d96674e6..5c104114754 100644
--- a/python/cuda_cccl/tests/compute/test_histogram.py
+++ b/python/cuda_cccl/tests/compute/test_histogram.py
@@ -12,6 +12,8 @@
     CountingIterator,
 )
 
+pytestmark = pytest.mark.no_numba
+
 DTYPE_LIST = [
     np.uint8,
     np.uint16,
diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py
index 49e2ab39d15..c3b6acc9fc9 100644
--- a/python/cuda_cccl/tests/compute/test_iterators.py
+++ b/python/cuda_cccl/tests/compute/test_iterators.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
@@ -22,6 +21,7 @@
 )
 
 
+@pytest.mark.no_numba
 def test_constant_iterator_equality():
     it1 = ConstantIterator(np.int32(0))
     it2 = ConstantIterator(np.int32(0))
@@ -32,6 +32,7 @@ def test_constant_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 def test_counting_iterator_equality():
     it1 = CountingIterator(np.int32(0))
     it2 = CountingIterator(np.int32(0))
@@ -42,6 +43,7 @@ def test_counting_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 def test_cache_modified_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -122,11 +124,14 @@ def reverse_iterator_array(request):
     if array_type == "cupy":
         array = cp.array(base_array)
     else:
+        import numba.cuda
+
         array = numba.cuda.to_device(base_array)
 
     return array
 
 
+@pytest.mark.no_numba
 def test_reverse_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -141,6 +146,7 @@ def test_reverse_input_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 def test_reverse_output_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -155,6 +161,7 @@ def test_reverse_output_iterator_equality():
     assert it1.kind != it4.kind
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, itemsize, expected",
     [
@@ -179,6 +186,7 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected):
     assert result == expected
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, dtype",
     [
diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py
index 33d7d15de58..db97d5aca01 100644
--- a/python/cuda_cccl/tests/compute/test_merge_sort.py
+++ b/python/cuda_cccl/tests/compute/test_merge_sort.py
@@ -5,7 +5,6 @@
 from typing import List
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
@@ -82,11 +81,11 @@ def compare_op(lhs, rhs):
 def test_merge_sort_keys(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_keys = cp.asarray(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items)
 
-    h_out_keys = d_in_keys.copy_to_host()
+    h_out_keys = d_in_keys.get()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -102,13 +101,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch):
     h_in_keys = random_array(num_items, dtype)
     h_in_items = random_array(num_items, np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items)
 
-    h_out_keys = d_in_keys.copy_to_host()
-    h_out_items = d_in_items.copy_to_host()
+    h_out_keys = d_in_keys.get()
+    h_out_items = d_in_items.get()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -123,12 +122,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items)
 
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -146,15 +145,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items)
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_items = d_out_items.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_items = d_out_items.get()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -225,11 +224,11 @@ def compare_complex(lhs, rhs):
     imaginary = random_array(num_items, np.int64, max_value)
 
     h_in_keys = real + 1j * imaginary
-    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_keys = cp.asarray(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items)
 
-    h_out_keys = d_in_keys.copy_to_host()
+    h_out_keys = d_in_keys.get()
     h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)]
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -240,15 +239,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream")
 
     merge_sort_device(i_input, None, d_out_keys, None, op, num_items)
 
     h_in_keys.sort()
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
 
@@ -265,10 +264,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
 
     i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -277,8 +276,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
         i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_items = d_out_items.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_items = d_out_items.get()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -307,6 +306,7 @@ def test_merge_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
+@pytest.mark.no_numba
 def test_merge_sort_well_known_less():
     dtype = np.int32
 
@@ -326,6 +326,7 @@ def test_merge_sort_well_known_less():
     np.testing.assert_equal(d_out_keys.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_merge_sort_well_known_greater():
     dtype = np.int32
 
diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py
index 9cbfa40e393..715873af3c8 100644
--- a/python/cuda_cccl/tests/compute/test_nested_struct.py
+++ b/python/cuda_cccl/tests/compute/test_nested_struct.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
+import pytest
 
 import cuda.compute
 from cuda.compute import ZipIterator, gpu_struct
@@ -149,6 +150,7 @@ def sum_pixels(p1, p2):
     assert result["color"]["b"] == expected_b
 
 
+@pytest.mark.no_numba
 def test_dict_init_nested_struct():
     """Test initializing a nested struct with a dictionary."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -162,6 +164,7 @@ def test_dict_init_nested_struct():
     assert np.isclose(obj.inner.b, 3.14)
 
 
+@pytest.mark.no_numba
 def test_dict_init_per_field():
     """Test initializing a struct with a dictionary for a nested field."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -175,6 +178,7 @@ def test_dict_init_per_field():
     assert np.isclose(obj.inner.b, 3.14)
 
 
+@pytest.mark.no_numba
 def test_dict_init_deeply_nested():
     """Test initializing deeply nested structs (3+ levels) with dictionaries."""
     Level1 = gpu_struct({"value": np.int32})
@@ -189,6 +193,7 @@ def test_dict_init_deeply_nested():
     assert obj.middle.nested.value == 42
 
 
+@pytest.mark.no_numba
 def test_dict_init_mixed():
     """Test mixed initialization with some dicts and some direct values."""
     Inner1 = gpu_struct({"a": np.int32, "b": np.int32})
diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py
index 8f1d271e1ed..bbab965c8bf 100644
--- a/python/cuda_cccl/tests/compute/test_no_numba.py
+++ b/python/cuda_cccl/tests/compute/test_no_numba.py
@@ -1,12 +1,6 @@
-import cupy as cp
-import numpy as np
 import pytest
 
-import cuda.compute
-from cuda.compute import OpKind
-
-# Mainly, these tests check that we can use algorithms with OpKind
-# operators while not requiring numba to be installed.
+# Check that tests marked no_numba fail fast if they import numba.
 pytestmark = pytest.mark.no_numba
 
 
@@ -16,70 +10,3 @@ def test_import_numba_raises():
         ImportError, match="This test is marked 'no_numba' but attempted to import it"
     ):
         import numba.cuda  # noqa: F401
-
-
-def test_reduce_op_kind():
-    num_items = 100
-    h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
-
-    h_init = np.array(0, dtype=np.int32)
-    cuda.compute.reduce_into(
-        d_in=d_input, d_out=d_output, num_items=num_items, op=OpKind.PLUS, h_init=h_init
-    )
-
-    result = d_output.get()[0]
-    expected = np.sum(h_input)
-    assert result == expected
-
-
-def test_binary_transform_op_kind():
-    num_items = 100
-    h_input1 = np.arange(num_items, dtype=np.int32)
-    h_input2 = np.arange(num_items, dtype=np.int32) * 2
-    d_input1 = cp.array(h_input1)
-    d_input2 = cp.array(h_input2)
-    d_output = cp.empty(num_items, dtype=np.int32)
-
-    cuda.compute.binary_transform(
-        d_in1=d_input1,
-        d_in2=d_input2,
-        d_out=d_output,
-        op=OpKind.PLUS,
-        num_items=num_items,
-    )
-
-    result = d_output.get()
-    expected = h_input1 + h_input2
-    assert np.array_equal(result, expected)
-
-
-def test_segmented_sort_op_kind():
-    # Create segments: [3, 1, 4] | [1, 5, 9, 2] | [6, 5]
-    num_items = 9
-    h_keys = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5], dtype=np.int32)
-    h_offsets = np.array([0, 3, 7, 9], dtype=np.int32)
-
-    d_keys_in = cp.array(h_keys)
-    d_keys_out = cp.empty(num_items, dtype=np.int32)
-    d_offsets = cp.array(h_offsets)
-
-    num_segments = len(h_offsets) - 1
-
-    cuda.compute.segmented_sort(
-        d_in_keys=d_keys_in,
-        d_out_keys=d_keys_out,
-        d_in_values=None,
-        d_out_values=None,
-        num_items=num_items,
-        num_segments=num_segments,
-        start_offsets_in=d_offsets[:-1],
-        end_offsets_in=d_offsets[1:],
-        order=cuda.compute.SortOrder.ASCENDING,
-    )
-
-    result = d_keys_out.get()
-    # Expected: [1, 3, 4] | [1, 2, 5, 9] | [5, 6]
-    expected = np.array([1, 3, 4, 1, 2, 5, 9, 5, 6], dtype=np.int32)
-    assert np.array_equal(result, expected)
diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
index 74d74a7b4e6..226904b75da 100644
--- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
+import pytest
 
 import cuda.compute
 from cuda.compute.iterators import (
@@ -12,6 +13,7 @@
 )
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_equality():
     values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     values2 = cp.asarray([100, 200, 300], dtype="int32")
@@ -37,6 +39,7 @@ def test_permutation_iterator_equality():
     assert it1.kind != it5.kind
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_with_array_values():
     values = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -54,6 +57,7 @@ def test_permutation_iterator_with_array_values():
     assert d_output[0] == values[indices].sum()
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_with_iterator_values():
     values_it = CountingIterator(np.int32(10))
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -157,6 +161,7 @@ def op(a):
     assert cp.all(d_out == expected)
 
 
+@pytest.mark.no_numba
 def test_caching_permutation_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -204,6 +209,7 @@ def test_caching_permutation_iterator():
     )
 
 
+@pytest.mark.no_numba
 def test_permutation_iterator_advance():
     """Test PermutationIterator.__add__ only advances indices, not values."""
     # Create values array [10, 20, 30, 40, 50, 60, 70]
diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index 62b8e7dcb44..67186d3a4f7 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -6,11 +6,11 @@
 from typing import Tuple
 
 import cupy as cp
-import numba
 import numpy as np
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     DoubleBuffer,
     SortOrder,
@@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu
     DTYPE_SIZE,
 )
 def test_radix_sort_keys(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items)
 
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_values = numba.cuda.to_device(h_in_values)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_values = numba.cuda.to_device(h_out_values)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_values = cp.asarray(h_in_values)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_values = cp.asarray(h_out_values)
 
     radix_sort_device(
         d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_values = d_out_values.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_values = d_out_values.get()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
 
     radix_sort_device(keys_double_buffer, None, None, None, order, num_items)
 
-    h_out_keys = keys_double_buffer.current().copy_to_host()
+    h_out_keys = keys_double_buffer.current().get()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_values = numba.cuda.to_device(h_in_values)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_values = numba.cuda.to_device(h_out_values)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_values = cp.asarray(h_in_values)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_values = cp.asarray(h_out_values)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
     values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
         keys_double_buffer, None, values_double_buffer, None, order, num_items
     )
 
-    h_out_keys = keys_double_buffer.current().copy_to_host()
-    h_out_values = values_double_buffer.current().copy_to_host()
+    h_out_keys = keys_double_buffer.current().get()
+    h_out_values = values_double_buffer.current().get()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE_BIT_WINDOW,
 )
 def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = numba.cuda.to_device(h_in_keys)
-        d_in_values = numba.cuda.to_device(h_in_values)
-        d_out_keys = numba.cuda.to_device(h_out_keys)
-        d_out_values = numba.cuda.to_device(h_out_values)
+        d_in_keys = cp.asarray(h_in_keys)
+        d_in_values = cp.asarray(h_in_values)
+        d_out_keys = cp.asarray(h_out_keys)
+        d_out_values = cp.asarray(h_out_values)
 
         radix_sort_device(
             d_in_keys,
@@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
             end_bit,
         )
 
-        h_out_keys = d_out_keys.copy_to_host()
-        h_out_values = d_out_values.copy_to_host()
+        h_out_keys = d_out_keys.get()
+        h_out_values = d_out_values.get()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = numba.cuda.to_device(h_in_keys)
-        d_in_values = numba.cuda.to_device(h_in_values)
-        d_out_keys = numba.cuda.to_device(h_out_keys)
-        d_out_values = numba.cuda.to_device(h_out_values)
+        d_in_keys = cp.asarray(h_in_keys)
+        d_in_values = cp.asarray(h_in_values)
+        d_out_keys = cp.asarray(h_out_keys)
+        d_out_values = cp.asarray(h_out_values)
 
         keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
         values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
             end_bit,
         )
 
-        h_out_keys = keys_double_buffer.current().copy_to_host()
-        h_out_values = values_double_buffer.current().copy_to_host()
+        h_out_keys = keys_double_buffer.current().get()
+        h_out_values = values_double_buffer.current().get()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -469,8 +469,9 @@ def test_radix_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
+@pytest.mark.no_numba
 def test_radix_sort(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -518,8 +519,9 @@ def test_radix_sort(monkeypatch):
     np.testing.assert_array_equal(h_out_items, h_in_values)
 
 
+@pytest.mark.no_numba
 def test_radix_sort_double_buffer(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py
index df3f024f0ee..15f11428506 100644
--- a/python/cuda_cccl/tests/compute/test_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_reduce.py
@@ -6,7 +6,6 @@
 import random
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 from cupy.cuda import runtime
@@ -73,15 +72,15 @@ def add_op(a, b):
 def test_device_reduce(dtype, num_items, op):
     init_value = 42
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = cp.asarray(h_input)
 
     cuda.compute.reduce_into(
         d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init
     )
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == pytest.approx(
         sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0
     )  # obtained relative error value from c2h/include/c2h/check_results.cuh
@@ -94,10 +93,10 @@ def test_device_reduce_with_lambda():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = cp.asarray(h_input)
 
     # Use a lambda function directly as the reducer
     cuda.compute.reduce_into(
@@ -107,7 +106,7 @@ def test_device_reduce_with_lambda():
         op=lambda a, b: a + b,
         h_init=h_init,
     )
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == sum(h_input) + init_value
 
 
@@ -118,10 +117,10 @@ def test_device_reduce_with_lambda_variable():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = cp.asarray(h_input)
 
     # Use a lambda function assigned to a variable as the reducer
     cuda.compute.reduce_into(
@@ -131,24 +130,24 @@ def test_device_reduce_with_lambda_variable():
         op=add_op_lambda,
         h_init=h_init,
     )
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == sum(h_input) + init_value
 
 
 def test_complex_device_reduce():
     h_init = np.array([40.0 + 2.0j], dtype=complex)
-    d_output = numba.cuda.device_array(1, dtype=complex)
+    d_output = cp.empty(1, dtype=complex)
 
     for num_items in [42, 420000]:
         real_imag = np.random.random((2, num_items))
         h_input = real_imag[0] + 1j * real_imag[1]
-        d_input = numba.cuda.to_device(h_input)
+        d_input = cp.asarray(h_input)
         assert d_input.size == num_items
         cuda.compute.reduce_into(
             d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
         )
 
-        result = d_output.copy_to_host()[0]
+        result = d_output.get()[0]
         expected = np.sum(h_input, initial=h_init[0])
         assert result == pytest.approx(expected)
 
@@ -162,11 +161,11 @@ def _test_device_sum_with_iterator(
 
     if use_numpy_array:
         h_input = np.array(l_varr, dtype_inp)
-        d_input = numba.cuda.to_device(h_input)
+        d_input = cp.asarray(h_input)
     else:
         d_input = i_input
 
-    d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
+    d_output = cp.empty(1, dtype_out)  # to store device sum
 
     h_init = np.array([start_sum_with], dtype_out)
 
@@ -174,7 +173,7 @@ def _test_device_sum_with_iterator(
         d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init
     )
 
-    h_output = d_output.copy_to_host()
+    h_output = d_output.get()
     assert h_output[0] == expected_result
 
 
@@ -216,7 +215,7 @@ def test_device_sum_cache_modified_input_it(
     l_varr = [rng.randrange(100) for _ in range(num_items)]
     dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
+    input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp))
     i_input = CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -691,6 +690,7 @@ def add_op(x, y):
         )
 
 
+@pytest.mark.no_numba
 def test_device_reduce_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -709,6 +709,7 @@ def test_device_reduce_well_known_plus():
     assert (d_output == expected_output).all()
 
 
+@pytest.mark.no_numba
 def test_device_reduce_well_known_minimum():
     dtype = np.int32
     h_init = np.array([100], dtype=dtype)
@@ -727,6 +728,7 @@ def test_device_reduce_well_known_minimum():
     assert (d_output == expected_output).all()
 
 
+@pytest.mark.no_numba
 def test_device_reduce_well_known_maximum():
     dtype = np.int32
     h_init = np.array([-100], dtype=dtype)
@@ -925,6 +927,7 @@ def sqrt(x: dtype) -> dtype:
     np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
 
 
+@pytest.mark.no_numba
 def test_reduce_with_not_guaranteed_determinism(floating_array):
     dtype = floating_array.dtype
     h_init = np.array([0], dtype=dtype)
@@ -942,6 +945,7 @@ def test_reduce_with_not_guaranteed_determinism(floating_array):
     )
 
 
+@pytest.mark.no_numba
 def test_reduce_bool():
     h_init = np.array([False])
     d_input = cp.array([True, False, True])
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index af6d941b11b..97f5fae630a 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -4,11 +4,11 @@
 
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     OpKind,
@@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea
     [True, False],
 )
 def test_scan_array_input(force_inclusive, input_array, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification if input is complex
     # as LDL/STL instructions are emitted for complex types.
     # Also skip for:
@@ -214,6 +214,7 @@ def op(a, b):
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
 
+@pytest.mark.no_numba
 def test_exclusive_scan_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -232,8 +233,9 @@ def test_exclusive_scan_well_known_plus():
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_inclusive_scan_well_known_plus(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -354,7 +356,7 @@ def add_op(a, b):
 
 
 def test_reverse_input_iterator(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -409,6 +411,7 @@ def add_op(a, b):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "force_inclusive",
     [True, False],
@@ -430,13 +433,14 @@ def test_future_init_value(force_inclusive):
     np.testing.assert_array_equal(expected, got)
 
 
+@pytest.mark.no_numba
 def test_no_init_value(monkeypatch):
     force_inclusive = True
     num_items = 1024
     dtype = np.dtype("int32")
 
     # Skip SASS check for CC 9.0 due to LDL/STL CI failure.
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     if cc_major >= 9:
         import cuda.compute._cccl_interop
 
@@ -456,6 +460,7 @@ def test_no_init_value(monkeypatch):
     np.testing.assert_array_equal(expected, got)
 
 
+@pytest.mark.no_numba
 def test_no_init_value_iterator():
     force_inclusive = True
     num_items = 1024
@@ -493,6 +498,7 @@ def test_inclusive_scan_with_lambda():
     np.testing.assert_array_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize("force_inclusive", [True, False])
 def test_scan_bool_maximum(force_inclusive):
     h_init = np.array([False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
index 950d860a14d..815c98b8edb 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
@@ -269,6 +269,7 @@ def _plus(a, b):
         )
 
 
+@pytest.mark.no_numba
 def test_segmented_reduce_well_known_plus(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -299,6 +300,7 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_segmented_reduce_well_known_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -329,6 +331,7 @@ def test_segmented_reduce_well_known_maximum(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_segmented_reduce_bool_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py
index 2bdd03abb42..9c2ab9f2de4 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_sort.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py
@@ -5,12 +5,13 @@
 from typing import Tuple
 
 import cupy as cp
-import numba
 import numpy as np
 import pytest
 
 import cuda.compute
 
+pytestmark = pytest.mark.no_numba
+
 DTYPE_LIST = [
     np.uint8,
     np.int16,
@@ -118,8 +119,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=50)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -133,7 +134,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
         order=order,
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
+    h_out_keys = d_out_keys.get()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -153,10 +154,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_vals = cp.asarray(h_in_vals)
+    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -170,8 +171,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_vals = d_out_vals.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_vals = d_out_vals.get()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -189,8 +190,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
 
     cuda.compute.segmented_sort(
@@ -205,7 +206,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().copy_to_host()
+    h_out_keys = keys_db.current().get()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -224,10 +225,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_vals = cp.asarray(h_in_vals)
+    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_tmp_vals = cp.asarray(np.empty_like(h_in_vals))
 
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
     vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals)
@@ -244,8 +245,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().copy_to_host()
-    h_out_vals = vals_db.current().copy_to_host()
+    h_out_keys = keys_db.current().get()
+    h_out_vals = vals_db.current().get()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -297,10 +298,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
     h_in_keys = random_array(num_items, np.int32, max_value=100)
     h_in_vals = random_array(num_items, np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_vals = cp.asarray(h_in_vals)
+    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -314,8 +315,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
         order=order,
     )
 
-    h_out_keys = d_out_keys.copy_to_host()
-    h_out_vals = d_out_vals.copy_to_host()
+    h_out_keys = d_out_keys.get()
+    h_out_vals = d_out_vals.get()
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
     )
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index f2483860a4a..d467d33c635 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -7,6 +7,8 @@
 import pytest
 
 import cuda.compute
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
 from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
 
 DTYPE_LIST = [
@@ -59,6 +61,35 @@ def _host_select(h_in: np.ndarray, cond):
     return selected, np.int64(selected.size)
 
 
+def _raw_even_i32_op() -> RawOp:  # RawOp predicate: true when the int input is even
+    source = """
+extern "C" __device__ void is_even_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = (value % 2) == 0;
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32")
+
+
+@pytest.mark.no_numba
+def test_select_raw_op_minimal():  # select() with a RawOp predicate keeps the evens
+    h_in = np.arange(10, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    d_out = cp.empty_like(d_in)
+    d_num_selected = cp.empty(2, dtype=np.uint64)  # only [0] is read below
+
+    cuda.compute.select(
+        d_in=d_in,
+        d_out=d_out,
+        d_num_selected_out=d_num_selected,
+        cond=_raw_even_i32_op(),
+        num_items=len(d_in),
+    )
+
+    num_selected = int(d_num_selected[0].get())  # count copied back to the host
+    np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0])
+
+
 @pytest.mark.parametrize("dtype,num_items", select_params)
 def test_select_basic(dtype, num_items):
     h_in = random_array(num_items, dtype, max_value=100)
diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
index 5c82fe1ad38..715f9e51235 100644
--- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
@@ -87,6 +87,7 @@ def test_shuffle_iterator_with_permutation_iterator():
     assert sorted(result) == sorted(d_values.get())
 
 
+@pytest.mark.no_numba
 def test_shuffle_iterator_invalid_num_items():
     with pytest.raises(ValueError, match="num_items must be > 0"):
         ShuffleIterator(0, seed=42)
diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
index 3ddaee54d41..cf1edd7e681 100644
--- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py
+++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
@@ -14,6 +14,8 @@
 
 from cuda.compute import gpu_struct
 
+pytestmark = pytest.mark.no_numba
+
 
 def test_newline_in_field_name_is_rejected():
     """Field names with newlines must be rejected — they are the exec() injection vector."""
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 3fb5b102275..9f2683f6798 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -7,6 +7,8 @@
 import pytest
 
 import cuda.compute
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
 from cuda.compute import CacheModifiedInputIterator, gpu_struct
 
 DTYPE_LIST = [
@@ -65,6 +67,50 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op):
     )
 
 
+def _raw_less_than_i32(name: str, threshold: int) -> RawOp:  # op: value < threshold
+    source = f"""
+extern "C" __device__ void {name}(void* x, void* result) {{
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = value < {threshold};
+}}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name=name)  # name used in source
+
+
+@pytest.mark.no_numba
+def test_three_way_partition_raw_op_minimal():  # RawOp predicates split 0..9 at 3, 6
+    h_in = np.arange(10, dtype=np.int32)
+    d_in = cp.asarray(h_in)
+    d_first = cp.empty_like(d_in)
+    d_second = cp.empty_like(d_in)
+    d_unselected = cp.empty_like(d_in)
+    d_num_selected = cp.empty(2, dtype=np.uint64)  # [0]: first, [1]: second count
+
+    cuda.compute.three_way_partition(
+        d_in=d_in,
+        d_first_part_out=d_first,
+        d_second_part_out=d_second,
+        d_unselected_out=d_unselected,
+        d_num_selected_out=d_num_selected,
+        select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3),
+        select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6),
+        num_items=len(d_in),
+    )
+
+    selected = d_num_selected.get()
+    first_count = int(selected[0])
+    second_count = int(selected[1])
+    unselected_count = len(h_in) - first_count - second_count  # the remainder
+
+    np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3])
+    np.testing.assert_array_equal(
+        d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)]
+    )
+    np.testing.assert_array_equal(
+        d_unselected.get()[:unselected_count], h_in[h_in >= 6]
+    )
+
+
 @pytest.mark.parametrize("dtype,num_items", three_way_partition_params)
 def test_three_way_partition_basic(dtype, num_items, monkeypatch):
     # NOTE: the SASS check failure is seen only with NVRTC 13.1:
diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py
index c7c3ca2818a..b8429726d77 100644
--- a/python/cuda_cccl/tests/compute/test_transform.py
+++ b/python/cuda_cccl/tests/compute/test_transform.py
@@ -266,6 +266,7 @@ def op2(a):
     np.testing.assert_allclose(expected, got)
 
 
+@pytest.mark.no_numba
 def test_unary_transform_well_known_negate():
     """Test unary transform with well-known NEGATE operation."""
     dtype = np.int32
@@ -282,6 +283,7 @@ def test_unary_transform_well_known_negate():
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_unary_transform_well_known_identity():
     """Test unary transform with well-known IDENTITY operation."""
     dtype = np.int32
@@ -298,6 +300,7 @@ def test_unary_transform_well_known_identity():
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 @pytest.mark.parametrize("dtype", [np.int32, np.float16])
 def test_binary_transform_well_known_plus(dtype):
     """Test binary transform with well-known PLUS operation."""
@@ -319,6 +322,7 @@ def test_binary_transform_well_known_plus(dtype):
     np.testing.assert_equal(d_output.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_binary_transform_well_known_multiplies():
     """Test binary transform with well-known MULTIPLIES operation."""
     dtype = np.int32
@@ -586,6 +590,7 @@ def test_binary_transform_with_lambda():
     np.testing.assert_array_equal(d_out.get(), expected)
 
 
+@pytest.mark.no_numba
 def test_binary_transform_bool_equal_to():
     d_input1 = cp.array([True, False, True, False], dtype=np.bool_)
     d_input2 = cp.array([True, True, False, False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 83a4a17db67..5f7d310d71e 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -4,11 +4,11 @@
 
 
 import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     CacheModifiedInputIterator,
     DiscardIterator,
@@ -125,7 +125,7 @@ def compare_op(lhs, rhs):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key(dtype, num_items, op, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
-    h_out_items = d_out_items.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_items = d_out_items.get()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int64)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
-    h_out_items = d_out_items.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_items = d_out_items.get()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -226,9 +226,9 @@ def test_unique_by_key_keys_only():
     h_out_keys = np.empty(num_items, dtype=np.int32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -240,9 +240,9 @@ def test_unique_by_key_keys_only():
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
 
     expected_keys, _ = unique_by_key_host(
         h_in_keys,
@@ -267,11 +267,11 @@ def compare_complex(lhs, rhs):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = cp.asarray(h_in_keys)
+    d_in_items = cp.asarray(h_in_items)
+    d_out_keys = cp.asarray(h_out_keys)
+    d_out_items = cp.asarray(h_out_items)
+    d_out_num_selected = cp.asarray(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -283,10 +283,10 @@ def compare_complex(lhs, rhs):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.copy_to_host()
+    h_out_num_selected = d_out_num_selected.get()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
-    h_out_items = d_out_items.copy_to_host()[:num_selected]
+    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_items = d_out_items.get()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(
         h_in_keys, h_in_items, compare_complex
@@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs):
 
 
 def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -417,8 +417,9 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
     np.testing.assert_array_equal(h_out_items, expected_items)
 
 
+@pytest.mark.no_numba
 def test_unique_by_key_well_known_equal_to(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
index e08709b6e1a..e4d8185eb91 100644
--- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
+++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
@@ -12,7 +12,9 @@
 """
 
 import pytest
-from numba import types
+
+numba = pytest.importorskip("numba")
+types = numba.types
 
 from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper
 from cuda.compute._utils import sanitize_identifier
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index 85630bb4f5f..aea1f167f9f 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -6,6 +6,7 @@
 import pytest
 
 import cuda.compute
+from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     TransformIterator,
@@ -261,10 +262,8 @@ def min_pairs(p1, p2):
 @pytest.mark.parametrize("num_items", [10, 1000])
 def test_output_zip_iterator_with_scan(monkeypatch, num_items):
     """Test ZipIterator as output iterator with scan operations."""
-    import numba.cuda
-
     # Skip SASS check for CC 8.0+ due to LDL/STL CI failure.
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -426,9 +425,7 @@ def sum_nested_zips(v1, v2):
     ],
 )
 def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map):
-    import numba.cuda
-
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = Device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -502,6 +499,7 @@ def g(x):
     assert it1.kind != it2.kind
 
 
+@pytest.mark.no_numba
 def test_caching_zip_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -577,6 +575,7 @@ def test_caching_zip_iterator():
     assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind"
 
 
+@pytest.mark.no_numba
 def test_compilation_caching_across_iterator_types():
     """Test that compilation caching works across different iterator types."""
     from cuda.compute import ConstantIterator

From d97d809f8eaf9b8fc60fe9ebb337ef52284526c8 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 12:48:11 -0500
Subject: [PATCH 03/16] Stop wrapping binary search comparator in python
 callable

---
 .../cuda/compute/algorithms/_binary_search.py | 21 +------
 .../tests/compute/test_binary_search.py       | 60 +++++++++++++++++++
 2 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
index 6cadd994ceb..23a99c7bc40 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py
@@ -16,19 +16,6 @@
 from ..typing import DeviceArrayLike, IteratorT, Operator
 
 
-def _normalize_comp(comp: Operator | None) -> OpAdapter:
-    # Use a lambda for the default comparator rather than OpKind.LESS
-    # because well-known ops don't carry type information needed by
-    # the binary search JIT compilation.
-    if comp is None or comp is OpKind.LESS:
-
-        def _default_less(a, b):
-            return a < b
-
-        return make_op_adapter(_default_less)
-    return make_op_adapter(comp)
-
-
 class _BinarySearch:
     __slots__ = [
         "build_result",
@@ -96,9 +83,7 @@ def __call__(
         set_cccl_iterator_state(self.d_out_cccl, d_out)
 
         # Update op state for stateful ops
-        comp_adapter = (
-            _normalize_comp(comp) if comp is not None else _normalize_comp(None)
-        )
+        comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp)
         self.op_cccl.state = comp_adapter.get_state()
 
         stream_handle = protocols.validate_and_get_stream(stream)
@@ -154,7 +139,7 @@ def make_lower_bound(
     See Also:
         :func:`lower_bound`
     """
-    comp_adapter = _normalize_comp(comp)
+    comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp)
     return _make_binary_search(
         d_data,
         d_values,
@@ -193,7 +178,7 @@ def make_upper_bound(
     See Also:
         :func:`upper_bound`
     """
-    comp_adapter = _normalize_comp(comp)
+    comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp)
     return _make_binary_search(
         d_data,
         d_values,
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index a4f7d047a9b..585a998c44c 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -6,6 +6,7 @@
 import pytest
 
 import cuda.compute
+from cuda.compute import OpKind
 
 DTYPE_LIST = [
     np.int32,
@@ -39,6 +40,65 @@ def disable_sass_check(monkeypatch):
     )
 
 
+@pytest.mark.parametrize(
+    "search, side",  # side is forwarded to np.searchsorted for the reference
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_explicit_opkind_less(search, side):  # comp=OpKind.LESS vs numpy
+    h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)  # ascending, with a duplicate
+    h_values = np.array([0, 3, 4, 10], dtype=np.int32)
+
+    d_data = cp.asarray(h_data)
+    d_values = cp.asarray(h_values)
+    d_out = cp.empty(len(h_values), dtype=np.uintp)
+
+    search(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_out,
+        comp=OpKind.LESS,  # passed explicitly rather than left as the default
+    )
+
+    expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
+@pytest.mark.parametrize(
+    "search, side",  # side is forwarded to np.searchsorted for the reference
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_custom_comparator(search, side):  # descending data + greater
+    h_data = np.array([9, 7, 3, 3, 1], dtype=np.int32)  # sorted descending
+    h_values = np.array([10, 4, 3, 0], dtype=np.int32)
+
+    def greater(lhs, rhs):
+        return lhs > rhs  # same ordering as -lhs < -rhs, used for `expected` below
+
+    d_data = cp.asarray(h_data)
+    d_values = cp.asarray(h_values)
+    d_out = cp.empty(len(h_values), dtype=np.uintp)
+
+    search(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_out,
+        comp=greater,
+    )
+
+    expected = np.searchsorted(-h_data, -h_values, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
 @pytest.mark.parametrize("dtype", DTYPE_LIST)
 @pytest.mark.parametrize(
     "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)]

From e0c72ab4a780c945ff2eaf849356bd4f0e121405 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 15:50:11 -0500
Subject: [PATCH 04/16] Add CI for 3.14t

---
 ci/matrix.yaml                                |  5 +-
 ci/test_cuda_compute_python.sh                |  4 +-
 ci/windows/build_common_python.psm1           |  5 +-
 ci/windows/build_cuda_cccl_python.ps1         |  5 +-
 ci/windows/test_cuda_cccl_examples_python.ps1 | 54 ++++++++---------
 ci/windows/test_cuda_cccl_headers_python.ps1  | 54 ++++++++---------
 ci/windows/test_cuda_compute_python.ps1       | 58 +++++++++----------
 ci/windows/test_cuda_coop_python.ps1          | 50 ++++++++--------
 python/cuda_cccl/pyproject.toml               |  1 +
 .../compute/test_free_threading_stress.py     |  1 +
 10 files changed, 121 insertions(+), 116 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 1fcb1df75ae..18b1a5e1515 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -82,9 +82,10 @@ workflows:
     - {jobs: ['test'], project: 'cccl_c_stf', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
     # Python -- pinned to gcc13 / msvc2022 for consistency across CTK images
     - {jobs: ['test'], project: 'python', ctk: ['12.X',        '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
-    - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
-    - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14', '3.14t'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
+    - {jobs: ['test'], project: 'python', py_version: ['3.14', '3.14t'], gpu: 'h100', cxx: 'gcc13'}
     - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14t', gpu: 'l4', cxx: 'gcc13'}
     # CCCL packaging:
     - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'}
     - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
diff --git a/ci/test_cuda_compute_python.sh b/ci/test_cuda_compute_python.sh
index bd6ad432178..c829e72518a 100755
--- a/ci/test_cuda_compute_python.sh
+++ b/ci/test_cuda_compute_python.sh
@@ -35,5 +35,5 @@ if [[ "${CCCL_PYTHON_USE_V2:-}" =~ ^(1|true|TRUE|on|ON)$ ]]; then
 fi
 
 cd "/home/coder/cccl/python/cuda_cccl/tests/"
-python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large"
-python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large"
+python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large and not free_threading"
+python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large and not free_threading"
diff --git a/ci/windows/build_common_python.psm1 b/ci/windows/build_common_python.psm1
index dab9258761a..060da49d4bd 100644
--- a/ci/windows/build_common_python.psm1
+++ b/ci/windows/build_common_python.psm1
@@ -4,12 +4,13 @@ function Get-Python {
         Returns the path of the Python interpreter satisfying the supplied
         version, installing it via uv if necessary.
     .PARAMETER Version
-        A string in the form 'M.m' (e.g., '3.10', '3.13').
+        A string in the form 'M.m' (e.g., '3.10', '3.13') or a free-threaded
+        version such as '3.14t'.
     #>
     [CmdletBinding()]
     param(
         [Parameter(Mandatory, Position = 0)]
-        [ValidatePattern('^\d+\.\d+$')]
+        [ValidatePattern('^\d+\.\d+t?$')]
         [string]$Version
     )
 
diff --git a/ci/windows/build_cuda_cccl_python.ps1 b/ci/windows/build_cuda_cccl_python.ps1
index 3d6dadd632a..3e825d38b9c 100644
--- a/ci/windows/build_cuda_cccl_python.ps1
+++ b/ci/windows/build_cuda_cccl_python.ps1
@@ -21,7 +21,8 @@
 
 .PARAMETER PyVersion
     **Required.** The Python version to use for building the wheel, expressed
-    as `<major>.<minor>` (e.g. `3.11`).
+    as `<major>.<minor>` (e.g. `3.11`) or a free-threaded version such as
+    `3.14t`.
 
 .PARAMETER OnlyCudaMajor
     Optional. Restricts the build to a single CUDA major version (`12` or `13`).
@@ -49,7 +50,7 @@
 Param(
     [Parameter(Mandatory = $true)]
     [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
+    [ValidatePattern("^\d+\.\d+t?$")]
     [string]$PyVersion,
 
     [Parameter(Mandatory = $false)]
diff --git a/ci/windows/test_cuda_cccl_examples_python.ps1 b/ci/windows/test_cuda_cccl_examples_python.ps1
index 0c108328822..8e50dc5d7bf 100644
--- a/ci/windows/test_cuda_cccl_examples_python.ps1
+++ b/ci/windows/test_cuda_cccl_examples_python.ps1
@@ -1,27 +1,27 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-$repoRoot = Get-RepoRoot
-
-${wheelPath} = Get-CudaCcclWheel
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
-
-Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n 6 test_examples.py
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+$repoRoot = Get-RepoRoot
+
+${wheelPath} = Get-CudaCcclWheel
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
+
+Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n 6 test_examples.py
+}
+finally { Pop-Location }
diff --git a/ci/windows/test_cuda_cccl_headers_python.ps1 b/ci/windows/test_cuda_cccl_headers_python.ps1
index 04a6adacc28..3a2fd40d51b 100644
--- a/ci/windows/test_cuda_cccl_headers_python.ps1
+++ b/ci/windows/test_cuda_cccl_headers_python.ps1
@@ -1,27 +1,27 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-$repoRoot = Get-RepoRoot
-
-${wheelPath} = Get-CudaCcclWheel
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
-
-Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n auto -v headers/
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+$repoRoot = Get-RepoRoot
+
+${wheelPath} = Get-CudaCcclWheel
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
+
+Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n auto -v headers/
+}
+finally { Pop-Location }
diff --git a/ci/windows/test_cuda_compute_python.ps1 b/ci/windows/test_cuda_compute_python.ps1
index 796d5128141..f8a9f2f7509 100644
--- a/ci/windows/test_cuda_compute_python.ps1
+++ b/ci/windows/test_cuda_compute_python.ps1
@@ -1,29 +1,29 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-$repoRoot = Get-RepoRoot
-
-$wheelPath = Get-CudaCcclWheel
-
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "$wheelPath[test-cu$cudaMajor]"
-
-Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n 6 -v compute/ -m "not large"
-    & $python -m pytest -n 0 -v compute/ -m "large"
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]  # major.minor with an optional trailing "t", e.g. 3.13 or 3.13t
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"  # stop at the first PowerShell error
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+$repoRoot = Get-RepoRoot
+
+$wheelPath = Get-CudaCcclWheel
+
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "$wheelPath[test-cu$cudaMajor]"  # wheel plus its test-cu<major> extra
+
+Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n 6 -v compute/ -m "not large and not free_threading"  # 6 xdist workers; free_threading-marked tests excluded
+    & $python -m pytest -n 0 -v compute/ -m "large and not free_threading"  # "large" tests run without xdist workers
+}
+finally { Pop-Location }  # restore the previous location even if pytest fails
diff --git a/ci/windows/test_cuda_coop_python.ps1 b/ci/windows/test_cuda_coop_python.ps1
index 7fb5f9628bc..b0168ffcb32 100644
--- a/ci/windows/test_cuda_coop_python.ps1
+++ b/ci/windows/test_cuda_coop_python.ps1
@@ -1,25 +1,25 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-${wheelPath} = Get-CudaCcclWheel
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
-
-Push-Location (Join-Path (Get-RepoRoot) "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n auto -v coop/_experimental/
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]  # major.minor with an optional trailing "t", e.g. 3.13 or 3.13t
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"  # stop at the first PowerShell error
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+${wheelPath} = Get-CudaCcclWheel
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"  # wheel plus its test-cu<major> extra
+
+Push-Location (Join-Path (Get-RepoRoot) "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n auto -v coop/_experimental/  # run the coop/_experimental/ tests
+}
+finally { Pop-Location }  # restore the previous location even if pytest fails
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 967bc86d58b..e90b58382c9 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -157,4 +157,5 @@ markers = [
   "no_verify_sass: skip SASS verification check",
   "large: tests requiring large device memory allocations",
   "no_numba: tests that should not import numba or numba.cuda",
+  "free_threading: tests requiring free-threaded CPython with the GIL disabled",
 ]
diff --git a/python/cuda_cccl/tests/compute/test_free_threading_stress.py b/python/cuda_cccl/tests/compute/test_free_threading_stress.py
index fa2e389a627..1e40996fbe5 100644
--- a/python/cuda_cccl/tests/compute/test_free_threading_stress.py
+++ b/python/cuda_cccl/tests/compute/test_free_threading_stress.py
@@ -17,6 +17,7 @@
 
 
 pytestmark = [
+    pytest.mark.free_threading,
     pytest.mark.no_numba,
     pytest.mark.no_verify_sass(
         reason="Free-threading stress tests intentionally run concurrent workers."

From dda2d2aa5147e04b6d12be20fda5bf7074d43810 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 15:51:27 -0500
Subject: [PATCH 05/16] Add benchmarks to measure host side overhead

---
 .../benchmarks/compute/host/build_time.py     |   63 +
 .../benchmarks/compute/host/common.py         | 1263 +++++++++++++++++
 .../compute/host/compare_results.py           |  258 ++++
 .../benchmarks/compute/host/oneshot_cached.py |   75 +
 .../benchmarks/compute/host/twoshot_call.py   |   77 +
 5 files changed, 1736 insertions(+)
 create mode 100644 python/cuda_cccl/benchmarks/compute/host/build_time.py
 create mode 100644 python/cuda_cccl/benchmarks/compute/host/common.py
 create mode 100644 python/cuda_cccl/benchmarks/compute/host/compare_results.py
 create mode 100644 python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py
 create mode 100644 python/cuda_cccl/benchmarks/compute/host/twoshot_call.py

diff --git a/python/cuda_cccl/benchmarks/compute/host/build_time.py b/python/cuda_cccl/benchmarks/compute/host/build_time.py
new file mode 100644
index 00000000000..b445f57d879
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/host/build_time.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import argparse
+import time
+
+import cuda.compute as cc
+
+from common import (
+    TimingResult,
+    add_case_filter,
+    add_json_output,
+    print_results,
+    select_cases,
+    synchronize,
+    write_results_json,
+)
+
+
+def main() -> None:  # CLI entry point: time make_wrapper once per sample.
+    parser = argparse.ArgumentParser(
+        description="Measure cold cuda.compute make_* build time."
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=10,
+        help="Number of cold build samples. Defaults to 10.",
+    )
+    add_case_filter(parser)
+    add_json_output(parser)
+    args = parser.parse_args()
+
+    results = []
+    for case in select_cases(args.case):
+        state = case.setup()
+        synchronize()  # device sync after setup, before any timing
+
+        samples_ns = []
+        for _ in range(args.repeat):
+            cc.clear_all_caches()  # clear caches before every timed build
+            start = time.perf_counter_ns()
+            case.make_wrapper(state)
+            end = time.perf_counter_ns()
+            samples_ns.append(end - start)  # one make_wrapper call per sample
+
+        results.append(TimingResult(case.name, samples_ns=samples_ns, number=1))
+
+    print_results(results)
+    if args.json is not None:  # JSON output only when --json was given
+        write_results_json(
+            args.json,
+            benchmark="build_time",
+            results=results,
+            config={"repeat": args.repeat, "case": args.case},
+        )
+
+if __name__ == "__main__":
+    main()
diff --git a/python/cuda_cccl/benchmarks/compute/host/common.py b/python/cuda_cccl/benchmarks/compute/host/common.py
new file mode 100644
index 00000000000..a6d7d4dea1b
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/host/common.py
@@ -0,0 +1,1263 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import argparse
+import json
+import platform
+import statistics
+import sys
+import time
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, Callable, Iterable, Literal
+
+import cupy as cp
+import numpy as np
+
+import cuda.compute as cc
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
+
+NOOP_TEMP_STORAGE_BYTES = 1  # byte count the no-op compute stubs report
+NUM_ITEMS = 128  # element count of the benchmark input arrays
+NUM_SEGMENTS = 4  # segment count used by the segmented cases
+MIN_SAMPLES_FOR_NOISE_ESTIMATE = 5  # with fewer samples, stdev_ns is None
+
+NoopReturnKind = Literal["none", "temp_storage_bytes", "temp_storage_and_selector"]
+
+
+@dataclass(frozen=True)
+class TimingResult:  # Timing samples (ns) and statistics derived from them.
+    name: str
+    samples_ns: list[float]  # one timing sample per repeat
+    number: int  # calls timed together to produce each sample
+
+    @property
+    def min_ns(self) -> float:
+        return min(self.samples_ns)
+
+    @property
+    def median_ns(self) -> float:
+        return statistics.median(self.samples_ns)
+
+    @property
+    def mean_ns(self) -> float:
+        return statistics.mean(self.samples_ns)
+
+    @property
+    def stdev_ns(self) -> float | None:
+        if len(self.samples_ns) < MIN_SAMPLES_FOR_NOISE_ESTIMATE:
+            return None  # too few samples for a noise estimate
+        return statistics.stdev(self.samples_ns)
+
+    @property
+    def relative_noise(self) -> float | None:
+        stdev_ns = self.stdev_ns
+        mean_ns = self.mean_ns
+        if stdev_ns is None or mean_ns <= 0:
+            return None  # no stdev available or non-positive mean
+        return stdev_ns / mean_ns  # stdev as a fraction of the mean
+
+    def as_json(self) -> dict[str, Any]:  # summary dict for JSON output
+        return {
+            "name": self.name,
+            "unit": "ns",
+            "number": self.number,
+            "samples": self.samples_ns,
+            "min": self.min_ns,
+            "median": self.median_ns,
+            "mean": self.mean_ns,
+            "stdev": self.stdev_ns,
+            "relative_noise": self.relative_noise,
+        }
+
+
+@dataclass(frozen=True)
+class HostBenchmarkCase:  # Callables and metadata for one benchmark case.
+    name: str
+    setup: Callable[[], SimpleNamespace]  # creates the per-case state
+    make_wrapper: Callable[[SimpleNamespace], Any]  # receives that state
+    oneshot: Callable[[SimpleNamespace], None]
+    twoshot: Callable[[SimpleNamespace, Any], None]  # (state, wrapper)
+    noop_return_kind: NoopReturnKind
+    skip_reason: str | None = None  # if set, select_cases skips the case
+
+
+class NoopBuildResult:
+    """Proxy that skips native compute while preserving wrapper host work."""
+
+    def __init__(self, real_build_result: Any, return_kind: NoopReturnKind):
+        self._real_build_result = real_build_result
+        self._return_kind = return_kind
+
+    def __getattr__(self, name: str) -> Any:  # only hit for missing attrs
+        return getattr(self._real_build_result, name)
+
+    def compute(self, *args, **kwargs):  # arguments are ignored
+        return _noop_return(self._return_kind)
+
+    def compute_even(self, *args, **kwargs):  # arguments are ignored
+        return _noop_return(self._return_kind)
+
+
+def _noop_return(return_kind: NoopReturnKind):  # Placeholder compute result.
+    if return_kind == "none":
+        return None
+    if return_kind == "temp_storage_bytes":
+        return NOOP_TEMP_STORAGE_BYTES
+    if return_kind == "temp_storage_and_selector":
+        return NOOP_TEMP_STORAGE_BYTES, -1  # -1: presumably a dummy selector
+    raise ValueError(f"Unsupported no-op return kind: {return_kind}")
+
+
+def patch_wrapper_to_skip_native_compute(
+    wrapper: Any, return_kind: NoopReturnKind
+) -> None:
+    """Patch a cached wrapper so measured calls skip native compute."""
+    if hasattr(wrapper, "build_result"):  # swap in the no-op proxy
+        wrapper.build_result = NoopBuildResult(wrapper.build_result, return_kind)
+
+    if hasattr(wrapper, "device_reduce_fn"):  # stub out when present
+        wrapper.device_reduce_fn = lambda *args, **kwargs: _noop_return(return_kind)
+
+    if hasattr(wrapper, "device_scan_fn"):  # stub out when present
+        wrapper.device_scan_fn = lambda *args, **kwargs: _noop_return(return_kind)
+
+    if hasattr(wrapper, "partitioner"):  # patch the nested object the same way
+        patch_wrapper_to_skip_native_compute(wrapper.partitioner, return_kind)
+
+
+def make_tiny_temp_storage() -> cp.ndarray:  # Tiny uint8 CuPy buffer.
+    return cp.empty(NOOP_TEMP_STORAGE_BYTES, dtype=cp.uint8)
+
+
+def synchronize() -> None:  # Synchronize the current CuPy device.
+    cp.cuda.Device().synchronize()
+
+
+def measure_call(
+    name: str,
+    fn: Callable[[], None],
+    *,
+    repeat: int,
+    number: int,
+) -> TimingResult:
+    samples_ns = []
+    for _ in range(repeat):  # one timing sample per repeat
+        start = time.perf_counter_ns()
+        for _ in range(number):  # batch of calls timed together
+            fn()
+        end = time.perf_counter_ns()
+        samples_ns.append((end - start) / number)  # mean ns per call
+    return TimingResult(name=name, samples_ns=samples_ns, number=number)
+
+
+def print_results(results: Iterable[TimingResult]) -> None:
+    rows = list(results)
+    name_width = max((len(row.name) for row in rows), default=4)
+    print(
+        f"{'case':<{name_width}}  {'median':>12}  {'min':>12}  "
+        f"{'mean':>12}  {'noise':>8}  {'repeat':>6}  {'number':>6}"
+    )
+    print("-" * (name_width + 68))
+    for result in rows:
+        print(
+            f"{result.name:<{name_width}}  "
+            f"{_format_ns(result.median_ns):>12}  "
+            f"{_format_ns(result.min_ns):>12}  "
+            f"{_format_ns(result.mean_ns):>12}  "
+            f"{_format_percentage(result.relative_noise):>8}  "
+            f"{len(result.samples_ns):>6}  "
+            f"{result.number:>6}"
+        )
+
+
+def _format_ns(ns: float) -> str:  # Render a duration as ns, us, or ms.
+    if ns < 1_000:
+        return f"{ns:.1f} ns"
+    if ns < 1_000_000:
+        return f"{ns / 1_000:.2f} us"
+    return f"{ns / 1_000_000:.2f} ms"  # everything from 1 ms upward
+
+
+def _format_percentage(value: float | None) -> str:  # Fraction -> "x.xx%".
+    if value is None:
+        return "inf"  # shown when no noise estimate is available
+    return f"{value * 100.0:.2f}%"
+
+
+def add_case_filter(parser: argparse.ArgumentParser) -> None:
+    parser.add_argument(
+        "--case",
+        action="append",  # repeated flags accumulate into a list
+        choices=[case.name for case in CASES],  # only known case names
+        help="Benchmark case to run. May be passed multiple times.",
+    )
+
+
+def add_json_output(parser: argparse.ArgumentParser) -> None:
+    parser.add_argument(
+        "--json",
+        type=Path,  # parsed into a pathlib.Path; None when omitted
+        help="Write structured benchmark results to this JSON file.",
+    )
+
+
+def write_results_json(
+    path: Path,
+    *,
+    benchmark: str,
+    results: Iterable[TimingResult],
+    config: dict[str, Any],
+) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)  # ensure output dir exists
+    payload = {
+        "schema": "cuda.compute.host_benchmark.v1",
+        "benchmark": benchmark,
+        "created_at": datetime.now(timezone.utc).isoformat(),  # UTC timestamp
+        "config": config,
+        "environment": _environment_info(),
+        "results": [result.as_json() for result in results],
+    }
+    path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+
+
+def _environment_info() -> dict[str, Any]:  # Python, platform, and device info.
+    device_count = cp.cuda.runtime.getDeviceCount()
+    devices = []
+    for device_id in range(device_count):  # one entry per visible device
+        props = cp.cuda.runtime.getDeviceProperties(device_id)
+        name = props["name"]
+        if isinstance(name, bytes):  # decode so the value is a str
+            name = name.decode()
+        devices.append(
+            {
+                "id": device_id,
+                "name": name,
+                "compute_capability": [
+                    int(props["major"]),
+                    int(props["minor"]),
+                ],
+            }
+        )
+
+    return {
+        "python": sys.version,
+        "platform": platform.platform(),
+        "devices": devices,
+    }
+
+
+def select_cases(case_names: list[str] | None) -> list[HostBenchmarkCase]:
+    if not case_names:  # None or empty list: start from every case
+        selected_cases = CASES
+    else:
+        selected = set(case_names)
+        selected_cases = [case for case in CASES if case.name in selected]
+
+    runnable = []
+    skipped_by_reason: dict[str, list[str]] = {}  # skip reason -> case names
+    for case in selected_cases:
+        if case.skip_reason is None:
+            runnable.append(case)
+        else:
+            skipped_by_reason.setdefault(case.skip_reason, []).append(case.name)
+
+    for reason, names in skipped_by_reason.items():  # one notice per reason
+        print(f"Skipping {len(names)} benchmark case(s): {', '.join(names)}")
+        print(f"  Reason: {reason}")
+
+    return runnable  # only cases without a skip reason
+
+
+def _numba_cuda_skip_reason() -> str | None:  # None if numba.cuda imports.
+    try:
+        import numba.cuda  # noqa: F401
+    except Exception as exc:  # any import-time failure becomes a skip reason
+        return f"numba.cuda is not available: {exc}"
+    return None
+
+
+_NUMBA_CUDA_SKIP_REASON = _numba_cuda_skip_reason()
+
+
+def _raw_predicate_i32(name: str) -> RawOp:  # RawOp: x < NUM_ITEMS // 2.
+    source = f"""
+extern "C" __device__ void {name}(void* x, void* result) {{
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = value < {NUM_ITEMS // 2};
+}}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name=name)
+
+
+def _raw_plus_i32() -> RawOp:  # RawOp writing lhs + rhs (as int) to result.
+    source = """
+extern "C" __device__ void host_bench_plus_i32(
+    void* lhs,
+    void* rhs,
+    void* result
+) {
+    *static_cast<int*>(result) =
+        *static_cast<int*>(lhs) + *static_cast<int*>(rhs);
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_plus_i32")
+
+
+def _raw_identity_i32() -> RawOp:  # RawOp copying the int at x to result.
+    source = """
+extern "C" __device__ void host_bench_identity_i32(void* x, void* result) {
+    *static_cast<int*>(result) = *static_cast<int*>(x);
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_identity_i32")
+
+
+def _raw_less_i32() -> RawOp:  # RawOp writing (lhs < rhs) as bool to result.
+    source = """
+extern "C" __device__ void host_bench_less_i32(
+    void* lhs,
+    void* rhs,
+    void* result
+) {
+    *static_cast<bool*>(result) =
+        *static_cast<int*>(lhs) < *static_cast<int*>(rhs);
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_less_i32")
+
+
+def _raw_equal_i32() -> RawOp:  # RawOp writing (lhs == rhs) as bool to result.
+    source = """
+extern "C" __device__ void host_bench_equal_i32(
+    void* lhs,
+    void* rhs,
+    void* result
+) {
+    *static_cast<bool*>(result) =
+        *static_cast<int*>(lhs) == *static_cast<int*>(rhs);
+}
+"""
+    return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_equal_i32")
+
+
+def _py_plus_i32(lhs, rhs):  # Plain-Python addition op.
+    return lhs + rhs
+
+
+def _py_identity_i32(x):  # Plain-Python identity op.
+    return x
+
+
+def _py_less_i32(lhs, rhs):  # Plain-Python less-than comparison.
+    return lhs < rhs
+
+
+def _py_equal_i32(lhs, rhs):  # Plain-Python equality comparison.
+    return lhs == rhs
+
+
+def _py_predicate_i32(x):  # True for x below NUM_ITEMS // 2.
+    return x < NUM_ITEMS // 2
+
+
+def _setup_unary_input_output() -> SimpleNamespace:  # One input, one output.
+    d_in = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    d_out = cp.empty_like(d_in)  # same shape and dtype as d_in
+    return SimpleNamespace(d_in=d_in, d_out=d_out, num_items=NUM_ITEMS)
+
+
+def _setup_binary_input_output() -> SimpleNamespace:  # Two inputs, one output.
+    d_in1 = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    d_in2 = cp.arange(NUM_ITEMS, dtype=cp.int32)  # same contents as d_in1
+    d_out = cp.empty_like(d_in1)
+    return SimpleNamespace(d_in1=d_in1, d_in2=d_in2, d_out=d_out, num_items=NUM_ITEMS)
+
+
+def _setup_reduce() -> SimpleNamespace:  # Unary state + PLUS op, zero init.
+    state = _setup_unary_input_output()
+    state.h_init = np.array([0], dtype=np.int32)  # host-side NumPy array
+    state.op = cc.OpKind.PLUS
+    state.temp_storage = make_tiny_temp_storage()
+    return state
+
+
+def _make_reduce(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_reduce_into(
+        d_in=state.d_in,
+        d_out=state.d_out[:1],  # single-element output slice
+        op=state.op,
+        h_init=state.h_init,
+    )
+
+
+def _oneshot_reduce(state: SimpleNamespace) -> None:  # One-shot entry point.
+    cc.reduce_into(
+        d_in=state.d_in,
+        d_out=state.d_out[:1],  # single-element output slice
+        num_items=state.num_items,
+        op=state.op,
+        h_init=state.h_init,
+    )
+
+
+def _twoshot_reduce(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out[:1],  # single-element output slice
+        num_items=state.num_items,
+        op=state.op,
+        h_init=state.h_init,
+    )
+
+
+def _setup_scan() -> SimpleNamespace:  # Unary state + PLUS op, zero init.
+    state = _setup_unary_input_output()
+    state.h_init = np.array([0], dtype=np.int32)  # host-side NumPy array
+    state.op = cc.OpKind.PLUS
+    state.temp_storage = make_tiny_temp_storage()
+    return state
+
+
+def _make_scan(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_exclusive_scan(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        init_value=state.h_init,  # state.h_init is passed as init_value
+    )
+
+
+def _oneshot_scan(state: SimpleNamespace) -> None:  # One-shot entry point.
+    cc.exclusive_scan(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        init_value=state.h_init,  # state.h_init is passed as init_value
+        num_items=state.num_items,
+    )
+
+
+def _twoshot_scan(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        init_value=state.h_init,  # state.h_init is passed as init_value
+        num_items=state.num_items,
+    )
+
+
+def _setup_segmented_reduce() -> SimpleNamespace:  # Evenly spaced segments.
+    d_in = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    d_out = cp.empty(NUM_SEGMENTS, dtype=cp.int32)  # one slot per segment
+    offsets = cp.asarray(
+        np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)
+    )
+    return SimpleNamespace(
+        d_in=d_in,
+        d_out=d_out,
+        start_offsets=offsets[:-1],  # every boundary except the last
+        end_offsets=offsets[1:],  # every boundary except the first
+        num_segments=NUM_SEGMENTS,
+        h_init=np.array([0], dtype=np.int32),
+        op=cc.OpKind.PLUS,
+        temp_storage=make_tiny_temp_storage(),
+    )
+
+
+def _make_segmented_reduce(state: SimpleNamespace):
+    return cc.make_segmented_reduce(  # result of the make_* call is returned
+        d_in=state.d_in,
+        d_out=state.d_out,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        op=state.op,
+        h_init=state.h_init,
+    )
+
+
+def _oneshot_segmented_reduce(state: SimpleNamespace) -> None:
+    cc.segmented_reduce(  # one-shot entry point
+        d_in=state.d_in,
+        d_out=state.d_out,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        op=state.op,
+        h_init=state.h_init,
+    )
+
+
+def _twoshot_segmented_reduce(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        op=state.op,
+        h_init=state.h_init,
+    )
+
+
+def _make_unary_transform(state: SimpleNamespace):
+    return cc.make_unary_transform(  # result of the make_* call is returned
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+    )
+
+
+def _oneshot_unary_transform(state: SimpleNamespace) -> None:
+    cc.unary_transform(  # one-shot entry point
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+    )
+
+
+def _twoshot_unary_transform(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; no temp_storage argument is passed
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+    )
+
+
+def _make_binary_transform(state: SimpleNamespace):
+    return cc.make_binary_transform(  # result of the make_* call is returned
+        d_in1=state.d_in1,
+        d_in2=state.d_in2,
+        d_out=state.d_out,
+        op=state.op,
+    )
+
+
+def _oneshot_binary_transform(state: SimpleNamespace) -> None:
+    cc.binary_transform(  # one-shot entry point
+        d_in1=state.d_in1,
+        d_in2=state.d_in2,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+    )
+
+
+def _twoshot_binary_transform(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; no temp_storage argument is passed
+        d_in1=state.d_in1,
+        d_in2=state.d_in2,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+    )
+
+
+def _setup_histogram() -> SimpleNamespace:  # 17 levels, 16 counters.
+    d_samples = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    num_output_levels = 17
+    d_histogram = cp.empty(num_output_levels - 1, dtype=cp.int32)
+    lower_level = np.int32(0)
+    upper_level = np.int32(NUM_ITEMS)
+    return SimpleNamespace(
+        d_samples=d_samples,
+        d_histogram=d_histogram,
+        num_output_levels=num_output_levels,  # plain int, used by one-shot
+        h_num_output_levels=np.array([num_output_levels], dtype=np.int32),
+        lower_level=lower_level,  # scalar form, used by one-shot
+        upper_level=upper_level,  # scalar form, used by one-shot
+        h_lower_level=np.array([lower_level], dtype=np.int32),
+        h_upper_level=np.array([upper_level], dtype=np.int32),
+        num_samples=NUM_ITEMS,
+        temp_storage=make_tiny_temp_storage(),
+    )
+
+
+def _make_histogram(state: SimpleNamespace):  # Uses the h_* array fields.
+    return cc.make_histogram_even(
+        d_samples=state.d_samples,
+        d_histogram=state.d_histogram,
+        h_num_output_levels=state.h_num_output_levels,
+        h_lower_level=state.h_lower_level,
+        h_upper_level=state.h_upper_level,
+        num_samples=state.num_samples,
+    )
+
+
+def _oneshot_histogram(state: SimpleNamespace) -> None:  # Uses scalar fields.
+    cc.histogram_even(
+        d_samples=state.d_samples,
+        d_histogram=state.d_histogram,
+        num_output_levels=state.num_output_levels,
+        lower_level=state.lower_level,
+        upper_level=state.upper_level,
+        num_samples=state.num_samples,
+    )
+
+
+def _twoshot_histogram(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_samples=state.d_samples,
+        d_histogram=state.d_histogram,
+        h_num_output_levels=state.h_num_output_levels,
+        h_lower_level=state.h_lower_level,
+        h_upper_level=state.h_upper_level,
+        num_samples=state.num_samples,
+    )
+
+
+def _setup_binary_search() -> SimpleNamespace:  # Data plus search values.
+    d_data = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    d_values = cp.arange(0, NUM_ITEMS, 2, dtype=cp.int32)  # every other value
+    d_out = cp.empty(d_values.size, dtype=np.uintp)  # one slot per value
+    return SimpleNamespace(
+        d_data=d_data,
+        d_values=d_values,
+        d_out=d_out,
+        num_items=NUM_ITEMS,
+        num_values=int(d_values.size),
+        comp=cc.OpKind.LESS,
+    )
+
+
+def _make_lower_bound(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_lower_bound(
+        d_data=state.d_data,
+        d_values=state.d_values,
+        d_out=state.d_out,
+        comp=state.comp,
+    )
+
+
+def _oneshot_lower_bound(state: SimpleNamespace) -> None:
+    cc.lower_bound(  # one-shot entry point
+        d_data=state.d_data,
+        num_items=state.num_items,
+        d_values=state.d_values,
+        num_values=state.num_values,
+        d_out=state.d_out,
+        comp=state.comp,
+    )
+
+
+def _twoshot_lower_bound(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; no temp_storage argument is passed
+        d_data=state.d_data,
+        num_items=state.num_items,
+        d_values=state.d_values,
+        num_values=state.num_values,
+        d_out=state.d_out,
+        comp=state.comp,
+    )
+
+
+def _setup_select() -> SimpleNamespace:  # Unary state + count and cond.
+    state = _setup_unary_input_output()
+    state.d_num_selected = cp.empty(1, dtype=np.uint64)  # single counter slot
+    state.cond = cc.OpKind.LOGICAL_NOT
+    state.temp_storage = make_tiny_temp_storage()
+    return state
+
+
+def _make_select(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_select(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        d_num_selected_out=state.d_num_selected,
+        cond=state.cond,
+    )
+
+
+def _oneshot_select(state: SimpleNamespace) -> None:  # One-shot entry point.
+    cc.select(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        d_num_selected_out=state.d_num_selected,
+        cond=state.cond,
+        num_items=state.num_items,
+    )
+
+
+def _twoshot_select(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out,
+        d_num_selected_out=state.d_num_selected,
+        cond=state.cond,
+        num_items=state.num_items,
+    )
+
+
+def _setup_three_way_partition() -> SimpleNamespace:  # Three output arrays.
+    state = _setup_unary_input_output()
+    state.d_first = cp.empty_like(state.d_in)
+    state.d_second = cp.empty_like(state.d_in)
+    state.d_unselected = cp.empty_like(state.d_in)
+    state.d_num_selected = cp.empty(2, dtype=np.uint64)  # two counter slots
+    state.first_op = cc.OpKind.LOGICAL_NOT
+    state.second_op = cc.OpKind.LOGICAL_NOT  # same op as first_op
+    state.temp_storage = make_tiny_temp_storage()
+    return state
+
+
+def _make_three_way_partition(state: SimpleNamespace):
+    return cc.make_three_way_partition(  # result of the make_* call is returned
+        d_in=state.d_in,
+        d_first_part_out=state.d_first,
+        d_second_part_out=state.d_second,
+        d_unselected_out=state.d_unselected,
+        d_num_selected_out=state.d_num_selected,
+        select_first_part_op=state.first_op,
+        select_second_part_op=state.second_op,
+    )
+
+
+def _oneshot_three_way_partition(state: SimpleNamespace) -> None:
+    cc.three_way_partition(  # one-shot entry point
+        d_in=state.d_in,
+        d_first_part_out=state.d_first,
+        d_second_part_out=state.d_second,
+        d_unselected_out=state.d_unselected,
+        d_num_selected_out=state.d_num_selected,
+        select_first_part_op=state.first_op,
+        select_second_part_op=state.second_op,
+        num_items=state.num_items,
+    )
+
+
+def _twoshot_three_way_partition(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_first_part_out=state.d_first,
+        d_second_part_out=state.d_second,
+        d_unselected_out=state.d_unselected,
+        d_num_selected_out=state.d_num_selected,
+        select_first_part_op=state.first_op,
+        select_second_part_op=state.second_op,
+        num_items=state.num_items,
+    )
+
+
+def _setup_unique_by_key() -> SimpleNamespace:  # Keys, items, and outputs.
+    d_keys = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    d_items = cp.arange(NUM_ITEMS, dtype=cp.int32)
+    return SimpleNamespace(
+        d_in_keys=d_keys,
+        d_in_items=d_items,
+        d_out_keys=cp.empty_like(d_keys),
+        d_out_items=cp.empty_like(d_items),
+        d_num_selected=cp.empty(1, dtype=np.uint64),  # single counter slot
+        op=cc.OpKind.EQUAL_TO,
+        num_items=NUM_ITEMS,
+        temp_storage=make_tiny_temp_storage(),
+    )
+
+
+def _make_unique_by_key(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_unique_by_key(
+        d_in_keys=state.d_in_keys,
+        d_in_items=state.d_in_items,
+        d_out_keys=state.d_out_keys,
+        d_out_items=state.d_out_items,
+        d_out_num_selected=state.d_num_selected,
+        op=state.op,
+    )
+
+
+def _oneshot_unique_by_key(state: SimpleNamespace) -> None:
+    cc.unique_by_key(  # one-shot entry point
+        d_in_keys=state.d_in_keys,
+        d_in_items=state.d_in_items,
+        d_out_keys=state.d_out_keys,
+        d_out_items=state.d_out_items,
+        d_out_num_selected=state.d_num_selected,
+        op=state.op,
+        num_items=state.num_items,
+    )
+
+
+def _twoshot_unique_by_key(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_in_items=state.d_in_items,
+        d_out_keys=state.d_out_keys,
+        d_out_items=state.d_out_items,
+        d_out_num_selected=state.d_num_selected,
+        op=state.op,
+        num_items=state.num_items,
+    )
+
+
+def _setup_sort() -> SimpleNamespace:  # Keys-only state for sort cases.
+    d_in_keys = cp.arange(NUM_ITEMS, 0, -1, dtype=cp.int32)  # descending keys
+    d_out_keys = cp.empty_like(d_in_keys)
+    return SimpleNamespace(
+        d_in_keys=d_in_keys,
+        d_out_keys=d_out_keys,
+        op=cc.OpKind.LESS,
+        num_items=NUM_ITEMS,
+        temp_storage=make_tiny_temp_storage(),
+    )
+
+
+def _make_merge_sort(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_merge_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        op=state.op,
+    )
+
+
+def _oneshot_merge_sort(state: SimpleNamespace) -> None:
+    cc.merge_sort(  # one-shot entry point
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        num_items=state.num_items,
+        op=state.op,
+    )
+
+
+def _twoshot_merge_sort(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_in_values=None,  # keys only: no values are passed
+        d_out_keys=state.d_out_keys,
+        d_out_values=None,
+        num_items=state.num_items,
+        op=state.op,
+    )
+
+
+def _make_radix_sort(state: SimpleNamespace):  # Returns the make_* result.
+    return cc.make_radix_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,  # keys only: no values are passed
+        d_out_values=None,
+        order=cc.SortOrder.ASCENDING,
+    )
+
+
+def _oneshot_radix_sort(state: SimpleNamespace) -> None:
+    cc.radix_sort(  # one-shot entry point
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        num_items=state.num_items,
+        order=cc.SortOrder.ASCENDING,
+    )
+
+
+def _twoshot_radix_sort(state: SimpleNamespace, wrapper) -> None:
+    wrapper(  # direct wrapper call; temp_storage comes from the state
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,  # keys only: no values are passed
+        d_out_values=None,
+        num_items=state.num_items,
+    )
+
+
+def _setup_segmented_sort() -> SimpleNamespace:
+    state = _setup_sort()
+    offsets = cp.asarray(
+        np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)
+    )
+    state.start_offsets = offsets[:-1]
+    state.end_offsets = offsets[1:]
+    state.num_segments = NUM_SEGMENTS
+    return state
+
+
+def _make_segmented_sort(state: SimpleNamespace):
+    return cc.make_segmented_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,
+        d_out_values=None,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        order=cc.SortOrder.ASCENDING,
+    )
+
+
+def _oneshot_segmented_sort(state: SimpleNamespace) -> None:
+    cc.segmented_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=state.num_items,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        order=cc.SortOrder.ASCENDING,
+    )
+
+
+def _twoshot_segmented_sort(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=state.num_items,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+    )
+
+
+def _setup_with_values(
+    setup_fn: Callable[[], SimpleNamespace], **values: Any
+) -> Callable[[], SimpleNamespace]:
+    def setup() -> SimpleNamespace:
+        state = setup_fn()
+        for name, value in values.items():
+            setattr(state, name, value)
+        return state
+
+    return setup
+
+
+def _setup_with_factories(
+    setup_fn: Callable[[], SimpleNamespace], **factories: Callable[[], Any]
+) -> Callable[[], SimpleNamespace]:
+    def setup() -> SimpleNamespace:
+        state = setup_fn()
+        for name, factory in factories.items():
+            setattr(state, name, factory())
+        return state
+
+    return setup
+
+
+def _make_case(
+    name: str,
+    setup: Callable[[], SimpleNamespace],
+    make_wrapper: Callable[[SimpleNamespace], Any],
+    oneshot: Callable[[SimpleNamespace], None],
+    twoshot: Callable[[SimpleNamespace, Any], None],
+    noop_return_kind: NoopReturnKind,
+    skip_reason: str | None = None,
+) -> HostBenchmarkCase:
+    return HostBenchmarkCase(
+        name,
+        setup,
+        make_wrapper,
+        oneshot,
+        twoshot,
+        noop_return_kind,
+        skip_reason,
+    )
+
+
+CASES = [
+    _make_case(
+        "reduce.plus",
+        _setup_with_values(_setup_reduce, op=cc.OpKind.PLUS),
+        _make_reduce,
+        _oneshot_reduce,
+        _twoshot_reduce,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "reduce.raw_cpp",
+        _setup_with_factories(_setup_reduce, op=_raw_plus_i32),
+        _make_reduce,
+        _oneshot_reduce,
+        _twoshot_reduce,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "reduce.python",
+        _setup_with_values(_setup_reduce, op=_py_plus_i32),
+        _make_reduce,
+        _oneshot_reduce,
+        _twoshot_reduce,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "exclusive_scan.plus",
+        _setup_with_values(_setup_scan, op=cc.OpKind.PLUS),
+        _make_scan,
+        _oneshot_scan,
+        _twoshot_scan,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "exclusive_scan.raw_cpp",
+        _setup_with_factories(_setup_scan, op=_raw_plus_i32),
+        _make_scan,
+        _oneshot_scan,
+        _twoshot_scan,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "exclusive_scan.python",
+        _setup_with_values(_setup_scan, op=_py_plus_i32),
+        _make_scan,
+        _oneshot_scan,
+        _twoshot_scan,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "segmented_reduce.plus",
+        _setup_with_values(_setup_segmented_reduce, op=cc.OpKind.PLUS),
+        _make_segmented_reduce,
+        _oneshot_segmented_reduce,
+        _twoshot_segmented_reduce,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "segmented_reduce.raw_cpp",
+        _setup_with_factories(_setup_segmented_reduce, op=_raw_plus_i32),
+        _make_segmented_reduce,
+        _oneshot_segmented_reduce,
+        _twoshot_segmented_reduce,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "segmented_reduce.python",
+        _setup_with_values(_setup_segmented_reduce, op=_py_plus_i32),
+        _make_segmented_reduce,
+        _oneshot_segmented_reduce,
+        _twoshot_segmented_reduce,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "unary_transform.identity",
+        _setup_with_values(_setup_unary_input_output, op=cc.OpKind.IDENTITY),
+        _make_unary_transform,
+        _oneshot_unary_transform,
+        _twoshot_unary_transform,
+        "none",
+    ),
+    _make_case(
+        "unary_transform.raw_cpp",
+        _setup_with_factories(_setup_unary_input_output, op=_raw_identity_i32),
+        _make_unary_transform,
+        _oneshot_unary_transform,
+        _twoshot_unary_transform,
+        "none",
+    ),
+    _make_case(
+        "unary_transform.python",
+        _setup_with_values(_setup_unary_input_output, op=_py_identity_i32),
+        _make_unary_transform,
+        _oneshot_unary_transform,
+        _twoshot_unary_transform,
+        "none",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "binary_transform.plus",
+        _setup_with_values(_setup_binary_input_output, op=cc.OpKind.PLUS),
+        _make_binary_transform,
+        _oneshot_binary_transform,
+        _twoshot_binary_transform,
+        "none",
+    ),
+    _make_case(
+        "binary_transform.raw_cpp",
+        _setup_with_factories(_setup_binary_input_output, op=_raw_plus_i32),
+        _make_binary_transform,
+        _oneshot_binary_transform,
+        _twoshot_binary_transform,
+        "none",
+    ),
+    _make_case(
+        "binary_transform.python",
+        _setup_with_values(_setup_binary_input_output, op=_py_plus_i32),
+        _make_binary_transform,
+        _oneshot_binary_transform,
+        _twoshot_binary_transform,
+        "none",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "histogram_even",
+        _setup_histogram,
+        _make_histogram,
+        _oneshot_histogram,
+        _twoshot_histogram,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "lower_bound.less",
+        _setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS),
+        _make_lower_bound,
+        _oneshot_lower_bound,
+        _twoshot_lower_bound,
+        "none",
+    ),
+    _make_case(
+        "lower_bound.raw_cpp",
+        _setup_with_factories(_setup_binary_search, comp=_raw_less_i32),
+        _make_lower_bound,
+        _oneshot_lower_bound,
+        _twoshot_lower_bound,
+        "none",
+    ),
+    _make_case(
+        "lower_bound.python",
+        _setup_with_values(_setup_binary_search, comp=_py_less_i32),
+        _make_lower_bound,
+        _oneshot_lower_bound,
+        _twoshot_lower_bound,
+        "none",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "select.logical_not",
+        _setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT),
+        _make_select,
+        _oneshot_select,
+        _twoshot_select,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "select.raw_cpp",
+        _setup_with_factories(
+            _setup_select,
+            cond=lambda: _raw_predicate_i32("host_bench_select_predicate_i32"),
+        ),
+        _make_select,
+        _oneshot_select,
+        _twoshot_select,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "select.python",
+        _setup_with_values(_setup_select, cond=_py_predicate_i32),
+        _make_select,
+        _oneshot_select,
+        _twoshot_select,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "three_way_partition.logical_not",
+        _setup_with_values(
+            _setup_three_way_partition,
+            first_op=cc.OpKind.LOGICAL_NOT,
+            second_op=cc.OpKind.LOGICAL_NOT,
+        ),
+        _make_three_way_partition,
+        _oneshot_three_way_partition,
+        _twoshot_three_way_partition,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "three_way_partition.raw_cpp",
+        _setup_with_factories(
+            _setup_three_way_partition,
+            first_op=lambda: _raw_predicate_i32("host_bench_partition_first_i32"),
+            second_op=lambda: _raw_predicate_i32("host_bench_partition_second_i32"),
+        ),
+        _make_three_way_partition,
+        _oneshot_three_way_partition,
+        _twoshot_three_way_partition,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "three_way_partition.python",
+        _setup_with_values(
+            _setup_three_way_partition,
+            first_op=_py_predicate_i32,
+            second_op=_py_predicate_i32,
+        ),
+        _make_three_way_partition,
+        _oneshot_three_way_partition,
+        _twoshot_three_way_partition,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "unique_by_key.equal",
+        _setup_with_values(_setup_unique_by_key, op=cc.OpKind.EQUAL_TO),
+        _make_unique_by_key,
+        _oneshot_unique_by_key,
+        _twoshot_unique_by_key,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "unique_by_key.raw_cpp",
+        _setup_with_factories(_setup_unique_by_key, op=_raw_equal_i32),
+        _make_unique_by_key,
+        _oneshot_unique_by_key,
+        _twoshot_unique_by_key,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "unique_by_key.python",
+        _setup_with_values(_setup_unique_by_key, op=_py_equal_i32),
+        _make_unique_by_key,
+        _oneshot_unique_by_key,
+        _twoshot_unique_by_key,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "merge_sort.less",
+        _setup_with_values(_setup_sort, op=cc.OpKind.LESS),
+        _make_merge_sort,
+        _oneshot_merge_sort,
+        _twoshot_merge_sort,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "merge_sort.raw_cpp",
+        _setup_with_factories(_setup_sort, op=_raw_less_i32),
+        _make_merge_sort,
+        _oneshot_merge_sort,
+        _twoshot_merge_sort,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "merge_sort.python",
+        _setup_with_values(_setup_sort, op=_py_less_i32),
+        _make_merge_sort,
+        _oneshot_merge_sort,
+        _twoshot_merge_sort,
+        "temp_storage_bytes",
+        _NUMBA_CUDA_SKIP_REASON,
+    ),
+    _make_case(
+        "radix_sort",
+        _setup_sort,
+        _make_radix_sort,
+        _oneshot_radix_sort,
+        _twoshot_radix_sort,
+        "temp_storage_and_selector",
+    ),
+    _make_case(
+        "segmented_sort",
+        _setup_segmented_sort,
+        _make_segmented_sort,
+        _oneshot_segmented_sort,
+        _twoshot_segmented_sort,
+        "temp_storage_and_selector",
+    ),
+]
diff --git a/python/cuda_cccl/benchmarks/compute/host/compare_results.py b/python/cuda_cccl/benchmarks/compute/host/compare_results.py
new file mode 100644
index 00000000000..7a670e11a01
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/host/compare_results.py
@@ -0,0 +1,258 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from enum import StrEnum
+from pathlib import Path
+from typing import Any
+
+SCHEMA = "cuda.compute.host_benchmark.v1"
+
+
+class _Color(StrEnum):
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    BLUE = "\033[34m"
+    YELLOW = "\033[33m"
+    RESET = "\033[0m"
+    NONE = ""
+
+
+class _Emoji(StrEnum):
+    YELLOW = "\U0001f7e1"
+    BLUE = "\U0001f535"
+    GREEN = "\U0001f7e2"
+    RED = "\U0001f534"
+    NONE = ""
+
+
+def _colorize(label: str, color: _Color, emoji: _Emoji, no_color: bool) -> str:
+    if no_color:
+        if emoji:
+            return f"{emoji} {label}"
+        return label
+    return f"{color}{label}{_Color.RESET}"
+
+
+def _format_ns(ns: float) -> str:
+    if ns < 1_000:
+        return f"{ns:.1f} ns"
+    if ns < 1_000_000:
+        return f"{ns / 1_000:.2f} us"
+    return f"{ns / 1_000_000:.2f} ms"
+
+
+def _format_percentage(value: float | None) -> str:
+    if value is None:
+        return "inf"
+    return f"{value * 100.0:.2f}%"
+
+
+def _load(path: Path) -> dict[str, Any]:
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    schema = payload.get("schema")
+    if schema != SCHEMA:
+        raise ValueError(f"{path}: expected schema {SCHEMA!r}, got {schema!r}")
+    return payload
+
+
+def _result_map(payload: dict[str, Any]) -> dict[str, dict[str, Any]]:
+    return {result["name"]: result for result in payload["results"]}
+
+
+def _minimum_noise(ref_noise: float | None, cmp_noise: float | None) -> float | None:
+    if ref_noise is not None and cmp_noise is not None:
+        return min(ref_noise, cmp_noise)
+    if ref_noise is not None:
+        return ref_noise
+    return cmp_noise
+
+
+def _status(
+    ref_mean: float,
+    cmp_mean: float,
+    ref_noise: float | None,
+    cmp_noise: float | None,
+) -> tuple[str, float, float, float | None]:
+    diff = cmp_mean - ref_mean
+    frac_diff = diff / ref_mean
+    min_noise = _minimum_noise(ref_noise, cmp_noise)
+
+    if min_noise is None:
+        return "????", diff, frac_diff, min_noise
+    if abs(frac_diff) <= min_noise:
+        return "SAME", diff, frac_diff, min_noise
+    if diff < 0:
+        return "FAST", diff, frac_diff, min_noise
+    return "SLOW", diff, frac_diff, min_noise
+
+
+def _format_status(status: str, *, no_color: bool) -> str:
+    if status == "SAME":
+        return _colorize(status, _Color.BLUE, _Emoji.BLUE, no_color)
+    if status == "FAST":
+        return _colorize(status, _Color.GREEN, _Emoji.GREEN, no_color)
+    if status == "SLOW":
+        return _colorize(status, _Color.RED, _Emoji.RED, no_color)
+    return _colorize(status, _Color.YELLOW, _Emoji.YELLOW, no_color)
+
+
+def _print_table(rows: list[list[str]]) -> None:
+    headers = [
+        "case",
+        "ref mean",
+        "ref noise",
+        "cmp mean",
+        "cmp noise",
+        "diff",
+        "%diff",
+        "status",
+    ]
+    widths = [
+        max(len(row[index]) for row in [headers, *rows])
+        for index in range(len(headers))
+    ]
+
+    def print_row(row: list[str]) -> None:
+        formatted = []
+        for index, value in enumerate(row):
+            if index in (0, 7):
+                formatted.append(value.ljust(widths[index]))
+            else:
+                formatted.append(value.rjust(widths[index]))
+        print("  ".join(formatted))
+
+    print_row(headers)
+    print("  ".join("-" * width for width in widths))
+    for row in rows:
+        print_row(row)
+
+
+def compare(
+    ref_payload: dict[str, Any],
+    cmp_payload: dict[str, Any],
+    *,
+    no_color: bool,
+    threshold: float,
+) -> dict[str, int]:
+    ref_results = _result_map(ref_payload)
+    cmp_results = _result_map(cmp_payload)
+    common_names = sorted(set(ref_results) & set(cmp_results))
+
+    rows = []
+    counts = {"total": 0, "same": 0, "unknown": 0, "fast": 0, "slow": 0}
+    for name in common_names:
+        ref_result = ref_results[name]
+        cmp_result = cmp_results[name]
+        ref_mean = float(ref_result["mean"])
+        cmp_mean = float(cmp_result["mean"])
+        ref_noise = ref_result["relative_noise"]
+        cmp_noise = cmp_result["relative_noise"]
+        status, diff, frac_diff, _ = _status(
+            ref_mean,
+            cmp_mean,
+            ref_noise,
+            cmp_noise,
+        )
+
+        counts["total"] += 1
+        if status == "SAME":
+            counts["same"] += 1
+        elif status == "FAST":
+            counts["fast"] += 1
+        elif status == "SLOW":
+            counts["slow"] += 1
+        else:
+            counts["unknown"] += 1
+
+        if abs(frac_diff) < threshold:
+            continue
+
+        rows.append(
+            [
+                name,
+                _format_ns(ref_mean),
+                _format_percentage(ref_noise),
+                _format_ns(cmp_mean),
+                _format_percentage(cmp_noise),
+                _format_ns(diff),
+                _format_percentage(frac_diff),
+                _format_status(status, no_color=no_color),
+            ]
+        )
+
+    if rows:
+        _print_table(rows)
+    else:
+        print("No matching benchmark cases exceeded the display threshold.")
+
+    missing_in_cmp = sorted(set(ref_results) - set(cmp_results))
+    missing_in_ref = sorted(set(cmp_results) - set(ref_results))
+    if missing_in_cmp:
+        print(f"\nMissing from compare: {', '.join(missing_in_cmp)}")
+    if missing_in_ref:
+        print(f"\nMissing from reference: {', '.join(missing_in_ref)}")
+
+    print("\n# Summary\n")
+    print(f"- Total Matches: {counts['total']}")
+    print(f"  - Same    (diff <= min noise): {counts['same']}")
+    print(f"  - Fast    (cmp faster):        {counts['fast']}")
+    print(f"  - Slow    (cmp slower):        {counts['slow']}")
+    print(f"  - Unknown (missing noise):     {counts['unknown']}")
+    return counts
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Compare two cuda.compute host benchmark JSON outputs."
+    )
+    parser.add_argument("reference", type=Path)
+    parser.add_argument("compare", type=Path)
+    parser.add_argument(
+        "--threshold-diff",
+        type=float,
+        default=0.0,
+        help="Only show rows where absolute relative diff is at least this value.",
+    )
+    parser.add_argument(
+        "--fail-on-change",
+        action="store_true",
+        help="Return nonzero if any case is classified FAST or SLOW.",
+    )
+    parser.add_argument(
+        "--no-color",
+        action="store_true",
+        help="Use emoji instead of ANSI color codes.",
+    )
+    args = parser.parse_args()
+
+    ref_payload = _load(args.reference)
+    cmp_payload = _load(args.compare)
+    ref_benchmark = ref_payload["benchmark"]
+    cmp_benchmark = cmp_payload["benchmark"]
+    if ref_benchmark != cmp_benchmark:
+        print(
+            f"Benchmark types do not match: {ref_benchmark!r} vs {cmp_benchmark!r}",
+            file=sys.stderr,
+        )
+        return 1
+
+    print(f"# {ref_benchmark}\n")
+    counts = compare(
+        ref_payload,
+        cmp_payload,
+        no_color=args.no_color,
+        threshold=args.threshold_diff,
+    )
+    if args.fail_on_change:
+        return counts["fast"] + counts["slow"]
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py b/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py
new file mode 100644
index 00000000000..fc9733f0702
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import argparse
+
+import cuda.compute as cc
+
+from common import (
+    add_case_filter,
+    add_json_output,
+    measure_call,
+    patch_wrapper_to_skip_native_compute,
+    print_results,
+    select_cases,
+    synchronize,
+    write_results_json,
+)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Measure cached cuda.compute public one-shot host overhead."
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=20,
+        help="Number of timing samples.",
+    )
+    parser.add_argument(
+        "--number",
+        type=int,
+        default=100,
+        help="Number of calls per timing sample.",
+    )
+    add_case_filter(parser)
+    add_json_output(parser)
+    args = parser.parse_args()
+
+    results = []
+    for case in select_cases(args.case):
+        cc.clear_all_caches()
+        state = case.setup()
+        wrapper = case.make_wrapper(state)
+        patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind)
+        synchronize()
+
+        results.append(
+            measure_call(
+                case.name,
+                lambda case=case, state=state: case.oneshot(state),
+                repeat=args.repeat,
+                number=args.number,
+            )
+        )
+
+    print_results(results)
+    if args.json is not None:
+        write_results_json(
+            args.json,
+            benchmark="oneshot_cached",
+            results=results,
+            config={
+                "repeat": args.repeat,
+                "number": args.number,
+                "case": args.case,
+            },
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py b/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py
new file mode 100644
index 00000000000..40370fa1aec
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import argparse
+
+import cuda.compute as cc
+
+from common import (
+    add_case_filter,
+    add_json_output,
+    measure_call,
+    patch_wrapper_to_skip_native_compute,
+    print_results,
+    select_cases,
+    synchronize,
+    write_results_json,
+)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Measure cached cuda.compute wrapper __call__ host overhead."
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=20,
+        help="Number of timing samples.",
+    )
+    parser.add_argument(
+        "--number",
+        type=int,
+        default=1000,
+        help="Number of calls per timing sample.",
+    )
+    add_case_filter(parser)
+    add_json_output(parser)
+    args = parser.parse_args()
+
+    results = []
+    for case in select_cases(args.case):
+        cc.clear_all_caches()
+        state = case.setup()
+        wrapper = case.make_wrapper(state)
+        patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind)
+        synchronize()
+
+        results.append(
+            measure_call(
+                case.name,
+                lambda case=case, state=state, wrapper=wrapper: case.twoshot(
+                    state, wrapper
+                ),
+                repeat=args.repeat,
+                number=args.number,
+            )
+        )
+
+    print_results(results)
+    if args.json is not None:
+        write_results_json(
+            args.json,
+            benchmark="twoshot_call",
+            results=results,
+            config={
+                "repeat": args.repeat,
+                "number": args.number,
+                "case": args.case,
+            },
+        )
+
+
+if __name__ == "__main__":
+    main()

From 1786eb0ca17f02f262a28644d3fa9b87eb885c0b Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 16:18:45 -0500
Subject: [PATCH 06/16] Use pytest-benchmark instead

---
 .gitignore                                    |   2 +
 .../benchmarks/compute/host/build_time.py     |  63 -----
 .../compute/host/compare_results.py           | 258 ------------------
 .../{common.py => host_benchmark_cases.py}    | 192 +------------
 .../benchmarks/compute/host/oneshot_cached.py |  75 -----
 .../host/test_host_pytest_benchmark.py        |  95 +++++++
 .../benchmarks/compute/host/twoshot_call.py   |  77 ------
 python/cuda_cccl/benchmarks/compute/pixi.toml |   1 +
 python/cuda_cccl/pyproject.toml               |  28 +-
 9 files changed, 123 insertions(+), 668 deletions(-)
 delete mode 100644 python/cuda_cccl/benchmarks/compute/host/build_time.py
 delete mode 100644 python/cuda_cccl/benchmarks/compute/host/compare_results.py
 rename python/cuda_cccl/benchmarks/compute/host/{common.py => host_benchmark_cases.py} (84%)
 delete mode 100644 python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py
 create mode 100644 python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
 delete mode 100644 python/cuda_cccl/benchmarks/compute/host/twoshot_call.py

diff --git a/.gitignore b/.gitignore
index 06687943b0e..0d8823d768b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,8 @@ __pycache__
 *.pyd
 wheelhouse/
 bench-artifacts/
+# Local results written by pytest-benchmark --benchmark-save.
+.benchmarks/
 CLAUDE.local.md
 .codegraph/*
 .cursor/rules/codegraph.mdc
diff --git a/python/cuda_cccl/benchmarks/compute/host/build_time.py b/python/cuda_cccl/benchmarks/compute/host/build_time.py
deleted file mode 100644
index b445f57d879..00000000000
--- a/python/cuda_cccl/benchmarks/compute/host/build_time.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-from __future__ import annotations
-
-import argparse
-import time
-
-import cuda.compute as cc
-
-from common import (
-    TimingResult,
-    add_case_filter,
-    add_json_output,
-    print_results,
-    select_cases,
-    synchronize,
-    write_results_json,
-)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Measure cold cuda.compute make_* build time."
-    )
-    parser.add_argument(
-        "--repeat",
-        type=int,
-        default=10,
-        help="Number of cold build samples. Defaults to 10.",
-    )
-    add_case_filter(parser)
-    add_json_output(parser)
-    args = parser.parse_args()
-
-    results = []
-    for case in select_cases(args.case):
-        state = case.setup()
-        synchronize()
-
-        samples_ns = []
-        for _ in range(args.repeat):
-            cc.clear_all_caches()
-            start = time.perf_counter_ns()
-            case.make_wrapper(state)
-            end = time.perf_counter_ns()
-            samples_ns.append(end - start)
-
-        results.append(TimingResult(case.name, samples_ns=samples_ns, number=1))
-
-    print_results(results)
-    if args.json is not None:
-        write_results_json(
-            args.json,
-            benchmark="build_time",
-            results=results,
-            config={"repeat": args.repeat, "case": args.case},
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cuda_cccl/benchmarks/compute/host/compare_results.py b/python/cuda_cccl/benchmarks/compute/host/compare_results.py
deleted file mode 100644
index 7a670e11a01..00000000000
--- a/python/cuda_cccl/benchmarks/compute/host/compare_results.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from enum import StrEnum
-from pathlib import Path
-from typing import Any
-
-SCHEMA = "cuda.compute.host_benchmark.v1"
-
-
-class _Color(StrEnum):
-    RED = "\033[31m"
-    GREEN = "\033[32m"
-    BLUE = "\033[34m"
-    YELLOW = "\033[33m"
-    RESET = "\033[0m"
-    NONE = ""
-
-
-class _Emoji(StrEnum):
-    YELLOW = "\U0001f7e1"
-    BLUE = "\U0001f535"
-    GREEN = "\U0001f7e2"
-    RED = "\U0001f534"
-    NONE = ""
-
-
-def _colorize(label: str, color: _Color, emoji: _Emoji, no_color: bool) -> str:
-    if no_color:
-        if emoji:
-            return f"{emoji} {label}"
-        return label
-    return f"{color}{label}{_Color.RESET}"
-
-
-def _format_ns(ns: float) -> str:
-    if ns < 1_000:
-        return f"{ns:.1f} ns"
-    if ns < 1_000_000:
-        return f"{ns / 1_000:.2f} us"
-    return f"{ns / 1_000_000:.2f} ms"
-
-
-def _format_percentage(value: float | None) -> str:
-    if value is None:
-        return "inf"
-    return f"{value * 100.0:.2f}%"
-
-
-def _load(path: Path) -> dict[str, Any]:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-    schema = payload.get("schema")
-    if schema != SCHEMA:
-        raise ValueError(f"{path}: expected schema {SCHEMA!r}, got {schema!r}")
-    return payload
-
-
-def _result_map(payload: dict[str, Any]) -> dict[str, dict[str, Any]]:
-    return {result["name"]: result for result in payload["results"]}
-
-
-def _minimum_noise(ref_noise: float | None, cmp_noise: float | None) -> float | None:
-    if ref_noise is not None and cmp_noise is not None:
-        return min(ref_noise, cmp_noise)
-    if ref_noise is not None:
-        return ref_noise
-    return cmp_noise
-
-
-def _status(
-    ref_mean: float,
-    cmp_mean: float,
-    ref_noise: float | None,
-    cmp_noise: float | None,
-) -> tuple[str, float, float, float | None]:
-    diff = cmp_mean - ref_mean
-    frac_diff = diff / ref_mean
-    min_noise = _minimum_noise(ref_noise, cmp_noise)
-
-    if min_noise is None:
-        return "????", diff, frac_diff, min_noise
-    if abs(frac_diff) <= min_noise:
-        return "SAME", diff, frac_diff, min_noise
-    if diff < 0:
-        return "FAST", diff, frac_diff, min_noise
-    return "SLOW", diff, frac_diff, min_noise
-
-
-def _format_status(status: str, *, no_color: bool) -> str:
-    if status == "SAME":
-        return _colorize(status, _Color.BLUE, _Emoji.BLUE, no_color)
-    if status == "FAST":
-        return _colorize(status, _Color.GREEN, _Emoji.GREEN, no_color)
-    if status == "SLOW":
-        return _colorize(status, _Color.RED, _Emoji.RED, no_color)
-    return _colorize(status, _Color.YELLOW, _Emoji.YELLOW, no_color)
-
-
-def _print_table(rows: list[list[str]]) -> None:
-    headers = [
-        "case",
-        "ref mean",
-        "ref noise",
-        "cmp mean",
-        "cmp noise",
-        "diff",
-        "%diff",
-        "status",
-    ]
-    widths = [
-        max(len(row[index]) for row in [headers, *rows])
-        for index in range(len(headers))
-    ]
-
-    def print_row(row: list[str]) -> None:
-        formatted = []
-        for index, value in enumerate(row):
-            if index in (0, 7):
-                formatted.append(value.ljust(widths[index]))
-            else:
-                formatted.append(value.rjust(widths[index]))
-        print("  ".join(formatted))
-
-    print_row(headers)
-    print("  ".join("-" * width for width in widths))
-    for row in rows:
-        print_row(row)
-
-
-def compare(
-    ref_payload: dict[str, Any],
-    cmp_payload: dict[str, Any],
-    *,
-    no_color: bool,
-    threshold: float,
-) -> dict[str, int]:
-    ref_results = _result_map(ref_payload)
-    cmp_results = _result_map(cmp_payload)
-    common_names = sorted(set(ref_results) & set(cmp_results))
-
-    rows = []
-    counts = {"total": 0, "same": 0, "unknown": 0, "fast": 0, "slow": 0}
-    for name in common_names:
-        ref_result = ref_results[name]
-        cmp_result = cmp_results[name]
-        ref_mean = float(ref_result["mean"])
-        cmp_mean = float(cmp_result["mean"])
-        ref_noise = ref_result["relative_noise"]
-        cmp_noise = cmp_result["relative_noise"]
-        status, diff, frac_diff, _ = _status(
-            ref_mean,
-            cmp_mean,
-            ref_noise,
-            cmp_noise,
-        )
-
-        counts["total"] += 1
-        if status == "SAME":
-            counts["same"] += 1
-        elif status == "FAST":
-            counts["fast"] += 1
-        elif status == "SLOW":
-            counts["slow"] += 1
-        else:
-            counts["unknown"] += 1
-
-        if abs(frac_diff) < threshold:
-            continue
-
-        rows.append(
-            [
-                name,
-                _format_ns(ref_mean),
-                _format_percentage(ref_noise),
-                _format_ns(cmp_mean),
-                _format_percentage(cmp_noise),
-                _format_ns(diff),
-                _format_percentage(frac_diff),
-                _format_status(status, no_color=no_color),
-            ]
-        )
-
-    if rows:
-        _print_table(rows)
-    else:
-        print("No matching benchmark cases exceeded the display threshold.")
-
-    missing_in_cmp = sorted(set(ref_results) - set(cmp_results))
-    missing_in_ref = sorted(set(cmp_results) - set(ref_results))
-    if missing_in_cmp:
-        print(f"\nMissing from compare: {', '.join(missing_in_cmp)}")
-    if missing_in_ref:
-        print(f"\nMissing from reference: {', '.join(missing_in_ref)}")
-
-    print("\n# Summary\n")
-    print(f"- Total Matches: {counts['total']}")
-    print(f"  - Same    (diff <= min noise): {counts['same']}")
-    print(f"  - Fast    (cmp faster):        {counts['fast']}")
-    print(f"  - Slow    (cmp slower):        {counts['slow']}")
-    print(f"  - Unknown (missing noise):     {counts['unknown']}")
-    return counts
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description="Compare two cuda.compute host benchmark JSON outputs."
-    )
-    parser.add_argument("reference", type=Path)
-    parser.add_argument("compare", type=Path)
-    parser.add_argument(
-        "--threshold-diff",
-        type=float,
-        default=0.0,
-        help="Only show rows where absolute relative diff is at least this value.",
-    )
-    parser.add_argument(
-        "--fail-on-change",
-        action="store_true",
-        help="Return nonzero if any case is classified FAST or SLOW.",
-    )
-    parser.add_argument(
-        "--no-color",
-        action="store_true",
-        help="Use emoji instead of ANSI color codes.",
-    )
-    args = parser.parse_args()
-
-    ref_payload = _load(args.reference)
-    cmp_payload = _load(args.compare)
-    ref_benchmark = ref_payload["benchmark"]
-    cmp_benchmark = cmp_payload["benchmark"]
-    if ref_benchmark != cmp_benchmark:
-        print(
-            f"Benchmark types do not match: {ref_benchmark!r} vs {cmp_benchmark!r}",
-            file=sys.stderr,
-        )
-        return 1
-
-    print(f"# {ref_benchmark}\n")
-    counts = compare(
-        ref_payload,
-        cmp_payload,
-        no_color=args.no_color,
-        threshold=args.threshold_diff,
-    )
-    if args.fail_on_change:
-        return counts["fast"] + counts["slow"]
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/python/cuda_cccl/benchmarks/compute/host/common.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
similarity index 84%
rename from python/cuda_cccl/benchmarks/compute/host/common.py
rename to python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
index a6d7d4dea1b..3fa97827435 100644
--- a/python/cuda_cccl/benchmarks/compute/host/common.py
+++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
@@ -4,17 +4,9 @@
 
 from __future__ import annotations
 
-import argparse
-import json
-import platform
-import statistics
-import sys
-import time
 from dataclasses import dataclass
-from datetime import datetime, timezone
-from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Callable, Iterable, Literal
+from typing import Any, Callable, Literal
 
 import cupy as cp
 import numpy as np
@@ -26,57 +18,10 @@
 NOOP_TEMP_STORAGE_BYTES = 1
 NUM_ITEMS = 128
 NUM_SEGMENTS = 4
-MIN_SAMPLES_FOR_NOISE_ESTIMATE = 5
 
 NoopReturnKind = Literal["none", "temp_storage_bytes", "temp_storage_and_selector"]
 
 
-@dataclass(frozen=True)
-class TimingResult:
-    name: str
-    samples_ns: list[float]
-    number: int
-
-    @property
-    def min_ns(self) -> float:
-        return min(self.samples_ns)
-
-    @property
-    def median_ns(self) -> float:
-        return statistics.median(self.samples_ns)
-
-    @property
-    def mean_ns(self) -> float:
-        return statistics.mean(self.samples_ns)
-
-    @property
-    def stdev_ns(self) -> float | None:
-        if len(self.samples_ns) < MIN_SAMPLES_FOR_NOISE_ESTIMATE:
-            return None
-        return statistics.stdev(self.samples_ns)
-
-    @property
-    def relative_noise(self) -> float | None:
-        stdev_ns = self.stdev_ns
-        mean_ns = self.mean_ns
-        if stdev_ns is None or mean_ns <= 0:
-            return None
-        return stdev_ns / mean_ns
-
-    def as_json(self) -> dict[str, Any]:
-        return {
-            "name": self.name,
-            "unit": "ns",
-            "number": self.number,
-            "samples": self.samples_ns,
-            "min": self.min_ns,
-            "median": self.median_ns,
-            "mean": self.mean_ns,
-            "stdev": self.stdev_ns,
-            "relative_noise": self.relative_noise,
-        }
-
-
 @dataclass(frozen=True)
 class HostBenchmarkCase:
     name: str
@@ -140,141 +85,6 @@ def synchronize() -> None:
     cp.cuda.Device().synchronize()
 
 
-def measure_call(
-    name: str,
-    fn: Callable[[], None],
-    *,
-    repeat: int,
-    number: int,
-) -> TimingResult:
-    samples_ns = []
-    for _ in range(repeat):
-        start = time.perf_counter_ns()
-        for _ in range(number):
-            fn()
-        end = time.perf_counter_ns()
-        samples_ns.append((end - start) / number)
-    return TimingResult(name=name, samples_ns=samples_ns, number=number)
-
-
-def print_results(results: Iterable[TimingResult]) -> None:
-    rows = list(results)
-    name_width = max((len(row.name) for row in rows), default=4)
-    print(
-        f"{'case':<{name_width}}  {'median':>12}  {'min':>12}  "
-        f"{'mean':>12}  {'noise':>8}  {'repeat':>6}  {'number':>6}"
-    )
-    print("-" * (name_width + 68))
-    for result in rows:
-        print(
-            f"{result.name:<{name_width}}  "
-            f"{_format_ns(result.median_ns):>12}  "
-            f"{_format_ns(result.min_ns):>12}  "
-            f"{_format_ns(result.mean_ns):>12}  "
-            f"{_format_percentage(result.relative_noise):>8}  "
-            f"{len(result.samples_ns):>6}  "
-            f"{result.number:>6}"
-        )
-
-
-def _format_ns(ns: float) -> str:
-    if ns < 1_000:
-        return f"{ns:.1f} ns"
-    if ns < 1_000_000:
-        return f"{ns / 1_000:.2f} us"
-    return f"{ns / 1_000_000:.2f} ms"
-
-
-def _format_percentage(value: float | None) -> str:
-    if value is None:
-        return "inf"
-    return f"{value * 100.0:.2f}%"
-
-
-def add_case_filter(parser: argparse.ArgumentParser) -> None:
-    parser.add_argument(
-        "--case",
-        action="append",
-        choices=[case.name for case in CASES],
-        help="Benchmark case to run. May be passed multiple times.",
-    )
-
-
-def add_json_output(parser: argparse.ArgumentParser) -> None:
-    parser.add_argument(
-        "--json",
-        type=Path,
-        help="Write structured benchmark results to this JSON file.",
-    )
-
-
-def write_results_json(
-    path: Path,
-    *,
-    benchmark: str,
-    results: Iterable[TimingResult],
-    config: dict[str, Any],
-) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    payload = {
-        "schema": "cuda.compute.host_benchmark.v1",
-        "benchmark": benchmark,
-        "created_at": datetime.now(timezone.utc).isoformat(),
-        "config": config,
-        "environment": _environment_info(),
-        "results": [result.as_json() for result in results],
-    }
-    path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
-
-
-def _environment_info() -> dict[str, Any]:
-    device_count = cp.cuda.runtime.getDeviceCount()
-    devices = []
-    for device_id in range(device_count):
-        props = cp.cuda.runtime.getDeviceProperties(device_id)
-        name = props["name"]
-        if isinstance(name, bytes):
-            name = name.decode()
-        devices.append(
-            {
-                "id": device_id,
-                "name": name,
-                "compute_capability": [
-                    int(props["major"]),
-                    int(props["minor"]),
-                ],
-            }
-        )
-
-    return {
-        "python": sys.version,
-        "platform": platform.platform(),
-        "devices": devices,
-    }
-
-
-def select_cases(case_names: list[str] | None) -> list[HostBenchmarkCase]:
-    if not case_names:
-        selected_cases = CASES
-    else:
-        selected = set(case_names)
-        selected_cases = [case for case in CASES if case.name in selected]
-
-    runnable = []
-    skipped_by_reason: dict[str, list[str]] = {}
-    for case in selected_cases:
-        if case.skip_reason is None:
-            runnable.append(case)
-        else:
-            skipped_by_reason.setdefault(case.skip_reason, []).append(case.name)
-
-    for reason, names in skipped_by_reason.items():
-        print(f"Skipping {len(names)} benchmark case(s): {', '.join(names)}")
-        print(f"  Reason: {reason}")
-
-    return runnable
-
-
 def _numba_cuda_skip_reason() -> str | None:
     try:
         import numba.cuda  # noqa: F401
diff --git a/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py b/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py
deleted file mode 100644
index fc9733f0702..00000000000
--- a/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-from __future__ import annotations
-
-import argparse
-
-import cuda.compute as cc
-
-from common import (
-    add_case_filter,
-    add_json_output,
-    measure_call,
-    patch_wrapper_to_skip_native_compute,
-    print_results,
-    select_cases,
-    synchronize,
-    write_results_json,
-)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Measure cached cuda.compute public one-shot host overhead."
-    )
-    parser.add_argument(
-        "--repeat",
-        type=int,
-        default=20,
-        help="Number of timing samples.",
-    )
-    parser.add_argument(
-        "--number",
-        type=int,
-        default=100,
-        help="Number of calls per timing sample.",
-    )
-    add_case_filter(parser)
-    add_json_output(parser)
-    args = parser.parse_args()
-
-    results = []
-    for case in select_cases(args.case):
-        cc.clear_all_caches()
-        state = case.setup()
-        wrapper = case.make_wrapper(state)
-        patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind)
-        synchronize()
-
-        results.append(
-            measure_call(
-                case.name,
-                lambda case=case, state=state: case.oneshot(state),
-                repeat=args.repeat,
-                number=args.number,
-            )
-        )
-
-    print_results(results)
-    if args.json is not None:
-        write_results_json(
-            args.json,
-            benchmark="oneshot_cached",
-            results=results,
-            config={
-                "repeat": args.repeat,
-                "number": args.number,
-                "case": args.case,
-            },
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
new file mode 100644
index 00000000000..1d64260400e
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import pytest
+
+import cuda.compute as cc
+
+from host_benchmark_cases import (
+    CASES,
+    HostBenchmarkCase,
+    patch_wrapper_to_skip_native_compute,
+    synchronize,
+)
+
+pytest.importorskip("pytest_benchmark")
+
+BUILD_TIME_ROUNDS = 10
+ONESHOT_ROUNDS = 20
+ONESHOT_ITERATIONS = 100
+TWOSHOT_ROUNDS = 20
+TWOSHOT_ITERATIONS = 1000
+
+
+def _case_params() -> list[pytest.ParameterSet]:
+    params = []
+    for case in CASES:
+        marks = []
+        if case.skip_reason is not None:
+            marks.append(pytest.mark.skip(reason=case.skip_reason))
+        params.append(pytest.param(case, id=case.name, marks=marks))
+    return params
+
+
+@pytest.mark.benchmark(group="cuda.compute.host.build_time")
+@pytest.mark.parametrize("case", _case_params())
+def test_build_time(benchmark, case: HostBenchmarkCase):
+    state = case.setup()
+    synchronize()
+
+    def setup() -> None:
+        cc.clear_all_caches()
+
+    def build():
+        return case.make_wrapper(state)
+
+    benchmark.pedantic(
+        build,
+        setup=setup,
+        rounds=BUILD_TIME_ROUNDS,
+        iterations=1,
+        warmup_rounds=0,
+    )
+
+
+@pytest.mark.benchmark(group="cuda.compute.host.oneshot_cached")
+@pytest.mark.parametrize("case", _case_params())
+def test_oneshot_cached_host_overhead(benchmark, case: HostBenchmarkCase):
+    cc.clear_all_caches()
+    state = case.setup()
+    wrapper = case.make_wrapper(state)
+    patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind)
+    synchronize()
+
+    def call() -> None:
+        case.oneshot(state)
+
+    benchmark.pedantic(
+        call,
+        rounds=ONESHOT_ROUNDS,
+        iterations=ONESHOT_ITERATIONS,
+        warmup_rounds=0,
+    )
+
+
+@pytest.mark.benchmark(group="cuda.compute.host.twoshot_call")
+@pytest.mark.parametrize("case", _case_params())
+def test_twoshot_call_host_overhead(benchmark, case: HostBenchmarkCase):
+    cc.clear_all_caches()
+    state = case.setup()
+    wrapper = case.make_wrapper(state)
+    patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind)
+    synchronize()
+
+    def call() -> None:
+        case.twoshot(state, wrapper)
+
+    benchmark.pedantic(
+        call,
+        rounds=TWOSHOT_ROUNDS,
+        iterations=TWOSHOT_ITERATIONS,
+        warmup_rounds=0,
+    )
diff --git a/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py b/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py
deleted file mode 100644
index 40370fa1aec..00000000000
--- a/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-from __future__ import annotations
-
-import argparse
-
-import cuda.compute as cc
-
-from common import (
-    add_case_filter,
-    add_json_output,
-    measure_call,
-    patch_wrapper_to_skip_native_compute,
-    print_results,
-    select_cases,
-    synchronize,
-    write_results_json,
-)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Measure cached cuda.compute wrapper __call__ host overhead."
-    )
-    parser.add_argument(
-        "--repeat",
-        type=int,
-        default=20,
-        help="Number of timing samples.",
-    )
-    parser.add_argument(
-        "--number",
-        type=int,
-        default=1000,
-        help="Number of calls per timing sample.",
-    )
-    add_case_filter(parser)
-    add_json_output(parser)
-    args = parser.parse_args()
-
-    results = []
-    for case in select_cases(args.case):
-        cc.clear_all_caches()
-        state = case.setup()
-        wrapper = case.make_wrapper(state)
-        patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind)
-        synchronize()
-
-        results.append(
-            measure_call(
-                case.name,
-                lambda case=case, state=state, wrapper=wrapper: case.twoshot(
-                    state, wrapper
-                ),
-                repeat=args.repeat,
-                number=args.number,
-            )
-        )
-
-    print_results(results)
-    if args.json is not None:
-        write_results_json(
-            args.json,
-            benchmark="twoshot_call",
-            results=results,
-            config={
-                "repeat": args.repeat,
-                "number": args.number,
-                "case": args.case,
-            },
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cuda_cccl/benchmarks/compute/pixi.toml b/python/cuda_cccl/benchmarks/compute/pixi.toml
index 35ed8792506..b53fee44387 100644
--- a/python/cuda_cccl/benchmarks/compute/pixi.toml
+++ b/python/cuda_cccl/benchmarks/compute/pixi.toml
@@ -19,6 +19,7 @@ cuda-version = "13.1.*"
 python = "3.13.*"
 numpy = "*"
 cupy = "*"
+pytest-benchmark = "*"
 pyyaml = "*"
 pre-commit = "*"
 
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 967bc86d58b..98e0e6533c0 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -88,10 +88,30 @@ test-cu12 = [
 test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x"]
 test-sysctk12 = ["cuda-cccl[sysctk12]", "pytest", "pytest-xdist", "cupy-cuda12x"]
 test-sysctk13 = ["cuda-cccl[sysctk13]", "pytest", "pytest-xdist", "cupy-cuda13x"]
-bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]", "cupy-cuda12x"]
-bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]", "cupy-cuda13x"]
-bench-sysctk12 = ["cuda-cccl[sysctk12]", "cuda-bench[cu12]", "cupy-cuda12x"]
-bench-sysctk13 = ["cuda-cccl[sysctk13]", "cuda-bench[cu13]", "cupy-cuda13x"]
+bench-cu12 = [
+  "cuda-cccl[cu12]",
+  "cuda-bench[cu12]",
+  "cupy-cuda12x",
+  "pytest-benchmark",
+]
+bench-cu13 = [
+  "cuda-cccl[cu13]",
+  "cuda-bench[cu13]",
+  "cupy-cuda13x",
+  "pytest-benchmark",
+]
+bench-sysctk12 = [
+  "cuda-cccl[sysctk12]",
+  "cuda-bench[cu12]",
+  "cupy-cuda12x",
+  "pytest-benchmark",
+]
+bench-sysctk13 = [
+  "cuda-cccl[sysctk13]",
+  "cuda-bench[cu13]",
+  "cupy-cuda13x",
+  "pytest-benchmark",
+]
 
 [project.urls]
 Homepage = "https://github.com/NVIDIA/cccl"

From a9219391427996f600bb66916ea76cc99ceb8891 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:34:18 +0000
Subject: [PATCH 07/16] [pre-commit.ci] auto code formatting

---
 .../benchmarks/compute/host/host_benchmark_cases.py       | 8 ++------
 .../benchmarks/compute/host/test_host_pytest_benchmark.py | 5 ++---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
index 3fa97827435..769846405bf 100644
--- a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
+++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
@@ -269,9 +269,7 @@ def _twoshot_scan(state: SimpleNamespace, wrapper) -> None:
 def _setup_segmented_reduce() -> SimpleNamespace:
     d_in = cp.arange(NUM_ITEMS, dtype=cp.int32)
     d_out = cp.empty(NUM_SEGMENTS, dtype=cp.int32)
-    offsets = cp.asarray(
-        np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)
-    )
+    offsets = cp.asarray(np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64))
     return SimpleNamespace(
         d_in=d_in,
         d_out=d_out,
@@ -687,9 +685,7 @@ def _twoshot_radix_sort(state: SimpleNamespace, wrapper) -> None:
 
 def _setup_segmented_sort() -> SimpleNamespace:
     state = _setup_sort()
-    offsets = cp.asarray(
-        np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)
-    )
+    offsets = cp.asarray(np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64))
     state.start_offsets = offsets[:-1]
     state.end_offsets = offsets[1:]
     state.num_segments = NUM_SEGMENTS
diff --git a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
index 1d64260400e..4e62cdd3a6d 100644
--- a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
+++ b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
@@ -5,9 +5,6 @@
 from __future__ import annotations
 
 import pytest
-
-import cuda.compute as cc
-
 from host_benchmark_cases import (
     CASES,
     HostBenchmarkCase,
@@ -15,6 +12,8 @@
     synchronize,
 )
 
+import cuda.compute as cc
+
 pytest.importorskip("pytest_benchmark")
 
 BUILD_TIME_ROUNDS = 10

From 1571d769a36d7135ff760e9742face5a2a97c6a2 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 16:49:10 -0500
Subject: [PATCH 08/16] Add a case that accepts a stream

---
 .../compute/host/host_benchmark_cases.py      | 460 ++++++++++++++++++
 .../host/test_host_pytest_benchmark.py        |  11 +-
 2 files changed, 466 insertions(+), 5 deletions(-)

diff --git a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
index 769846405bf..9550650567f 100644
--- a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
+++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
@@ -12,6 +12,7 @@
 import numpy as np
 
 import cuda.compute as cc
+from cuda.core import Device
 from cuda.compute._cpp_compile import compile_cpp_op_code
 from cuda.compute.op import RawOp
 
@@ -81,6 +82,12 @@ def make_tiny_temp_storage() -> cp.ndarray:
     return cp.empty(NOOP_TEMP_STORAGE_BYTES, dtype=cp.uint8)
 
 
+def make_stream():
+    device = Device()
+    device.set_current()
+    return device.create_stream()
+
+
 def synchronize() -> None:
     cp.cuda.Device().synchronize()
 
@@ -228,6 +235,29 @@ def _twoshot_reduce(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_reduce_stream(state: SimpleNamespace) -> None:
+    cc.reduce_into(
+        d_in=state.d_in,
+        d_out=state.d_out[:1],
+        num_items=state.num_items,
+        op=state.op,
+        h_init=state.h_init,
+        stream=state.stream,
+    )
+
+
+def _twoshot_reduce_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out[:1],
+        num_items=state.num_items,
+        op=state.op,
+        h_init=state.h_init,
+        stream=state.stream,
+    )
+
+
 def _setup_scan() -> SimpleNamespace:
     state = _setup_unary_input_output()
     state.h_init = np.array([0], dtype=np.int32)
@@ -266,6 +296,29 @@ def _twoshot_scan(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_scan_stream(state: SimpleNamespace) -> None:
+    cc.exclusive_scan(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        init_value=state.h_init,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
+def _twoshot_scan_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        init_value=state.h_init,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _setup_segmented_reduce() -> SimpleNamespace:
     d_in = cp.arange(NUM_ITEMS, dtype=cp.int32)
     d_out = cp.empty(NUM_SEGMENTS, dtype=cp.int32)
@@ -318,6 +371,33 @@ def _twoshot_segmented_reduce(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_segmented_reduce_stream(state: SimpleNamespace) -> None:
+    cc.segmented_reduce(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        op=state.op,
+        h_init=state.h_init,
+        stream=state.stream,
+    )
+
+
+def _twoshot_segmented_reduce_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        op=state.op,
+        h_init=state.h_init,
+        stream=state.stream,
+    )
+
+
 def _make_unary_transform(state: SimpleNamespace):
     return cc.make_unary_transform(
         d_in=state.d_in,
@@ -344,6 +424,26 @@ def _twoshot_unary_transform(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_unary_transform_stream(state: SimpleNamespace) -> None:
+    cc.unary_transform(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
+def _twoshot_unary_transform_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _make_binary_transform(state: SimpleNamespace):
     return cc.make_binary_transform(
         d_in1=state.d_in1,
@@ -373,6 +473,28 @@ def _twoshot_binary_transform(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_binary_transform_stream(state: SimpleNamespace) -> None:
+    cc.binary_transform(
+        d_in1=state.d_in1,
+        d_in2=state.d_in2,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
+def _twoshot_binary_transform_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        d_in1=state.d_in1,
+        d_in2=state.d_in2,
+        d_out=state.d_out,
+        op=state.op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _setup_histogram() -> SimpleNamespace:
     d_samples = cp.arange(NUM_ITEMS, dtype=cp.int32)
     num_output_levels = 17
@@ -427,6 +549,31 @@ def _twoshot_histogram(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_histogram_stream(state: SimpleNamespace) -> None:
+    cc.histogram_even(
+        d_samples=state.d_samples,
+        d_histogram=state.d_histogram,
+        num_output_levels=state.num_output_levels,
+        lower_level=state.lower_level,
+        upper_level=state.upper_level,
+        num_samples=state.num_samples,
+        stream=state.stream,
+    )
+
+
+def _twoshot_histogram_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_samples=state.d_samples,
+        d_histogram=state.d_histogram,
+        h_num_output_levels=state.h_num_output_levels,
+        h_lower_level=state.h_lower_level,
+        h_upper_level=state.h_upper_level,
+        num_samples=state.num_samples,
+        stream=state.stream,
+    )
+
+
 def _setup_binary_search() -> SimpleNamespace:
     d_data = cp.arange(NUM_ITEMS, dtype=cp.int32)
     d_values = cp.arange(0, NUM_ITEMS, 2, dtype=cp.int32)
@@ -472,6 +619,30 @@ def _twoshot_lower_bound(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_lower_bound_stream(state: SimpleNamespace) -> None:
+    cc.lower_bound(
+        d_data=state.d_data,
+        num_items=state.num_items,
+        d_values=state.d_values,
+        num_values=state.num_values,
+        d_out=state.d_out,
+        comp=state.comp,
+        stream=state.stream,
+    )
+
+
+def _twoshot_lower_bound_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        d_data=state.d_data,
+        num_items=state.num_items,
+        d_values=state.d_values,
+        num_values=state.num_values,
+        d_out=state.d_out,
+        comp=state.comp,
+        stream=state.stream,
+    )
+
+
 def _setup_select() -> SimpleNamespace:
     state = _setup_unary_input_output()
     state.d_num_selected = cp.empty(1, dtype=np.uint64)
@@ -510,6 +681,29 @@ def _twoshot_select(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_select_stream(state: SimpleNamespace) -> None:
+    cc.select(
+        d_in=state.d_in,
+        d_out=state.d_out,
+        d_num_selected_out=state.d_num_selected,
+        cond=state.cond,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
+def _twoshot_select_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_out=state.d_out,
+        d_num_selected_out=state.d_num_selected,
+        cond=state.cond,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _setup_three_way_partition() -> SimpleNamespace:
     state = _setup_unary_input_output()
     state.d_first = cp.empty_like(state.d_in)
@@ -561,6 +755,35 @@ def _twoshot_three_way_partition(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_three_way_partition_stream(state: SimpleNamespace) -> None:
+    cc.three_way_partition(
+        d_in=state.d_in,
+        d_first_part_out=state.d_first,
+        d_second_part_out=state.d_second,
+        d_unselected_out=state.d_unselected,
+        d_num_selected_out=state.d_num_selected,
+        select_first_part_op=state.first_op,
+        select_second_part_op=state.second_op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
+def _twoshot_three_way_partition_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in=state.d_in,
+        d_first_part_out=state.d_first,
+        d_second_part_out=state.d_second,
+        d_unselected_out=state.d_unselected,
+        d_num_selected_out=state.d_num_selected,
+        select_first_part_op=state.first_op,
+        select_second_part_op=state.second_op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _setup_unique_by_key() -> SimpleNamespace:
     d_keys = cp.arange(NUM_ITEMS, dtype=cp.int32)
     d_items = cp.arange(NUM_ITEMS, dtype=cp.int32)
@@ -612,6 +835,33 @@ def _twoshot_unique_by_key(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_unique_by_key_stream(state: SimpleNamespace) -> None:
+    cc.unique_by_key(
+        d_in_keys=state.d_in_keys,
+        d_in_items=state.d_in_items,
+        d_out_keys=state.d_out_keys,
+        d_out_items=state.d_out_items,
+        d_out_num_selected=state.d_num_selected,
+        op=state.op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
+def _twoshot_unique_by_key_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_in_items=state.d_in_items,
+        d_out_keys=state.d_out_keys,
+        d_out_items=state.d_out_items,
+        d_out_num_selected=state.d_num_selected,
+        op=state.op,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _setup_sort() -> SimpleNamespace:
     d_in_keys = cp.arange(NUM_ITEMS, 0, -1, dtype=cp.int32)
     d_out_keys = cp.empty_like(d_in_keys)
@@ -653,6 +903,29 @@ def _twoshot_merge_sort(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_merge_sort_stream(state: SimpleNamespace) -> None:
+    cc.merge_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        num_items=state.num_items,
+        op=state.op,
+        stream=state.stream,
+    )
+
+
+def _twoshot_merge_sort_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_in_values=None,
+        d_out_keys=state.d_out_keys,
+        d_out_values=None,
+        num_items=state.num_items,
+        op=state.op,
+        stream=state.stream,
+    )
+
+
 def _make_radix_sort(state: SimpleNamespace):
     return cc.make_radix_sort(
         d_in_keys=state.d_in_keys,
@@ -683,6 +956,28 @@ def _twoshot_radix_sort(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_radix_sort_stream(state: SimpleNamespace) -> None:
+    cc.radix_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        num_items=state.num_items,
+        order=cc.SortOrder.ASCENDING,
+        stream=state.stream,
+    )
+
+
+def _twoshot_radix_sort_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=state.num_items,
+        stream=state.stream,
+    )
+
+
 def _setup_segmented_sort() -> SimpleNamespace:
     state = _setup_sort()
     offsets = cp.asarray(np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64))
@@ -732,6 +1027,36 @@ def _twoshot_segmented_sort(state: SimpleNamespace, wrapper) -> None:
     )
 
 
+def _oneshot_segmented_sort_stream(state: SimpleNamespace) -> None:
+    cc.segmented_sort(
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=state.num_items,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        order=cc.SortOrder.ASCENDING,
+        stream=state.stream,
+    )
+
+
+def _twoshot_segmented_sort_stream(state: SimpleNamespace, wrapper) -> None:
+    wrapper(
+        temp_storage=state.temp_storage,
+        d_in_keys=state.d_in_keys,
+        d_out_keys=state.d_out_keys,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=state.num_items,
+        num_segments=state.num_segments,
+        start_offsets_in=state.start_offsets,
+        end_offsets_in=state.end_offsets,
+        stream=state.stream,
+    )
+
+
 def _setup_with_values(
     setup_fn: Callable[[], SimpleNamespace], **values: Any
 ) -> Callable[[], SimpleNamespace]:
@@ -756,6 +1081,17 @@ def setup() -> SimpleNamespace:
     return setup
 
 
+def _setup_with_stream(
+    setup_fn: Callable[[], SimpleNamespace],
+) -> Callable[[], SimpleNamespace]:
+    def setup() -> SimpleNamespace:
+        state = setup_fn()
+        state.stream = make_stream()
+        return state
+
+    return setup
+
+
 def _make_case(
     name: str,
     setup: Callable[[], SimpleNamespace],
@@ -1067,3 +1403,127 @@ def _make_case(
         "temp_storage_and_selector",
     ),
 ]
+
+
+STREAM_CASES = [
+    _make_case(
+        "reduce.plus.stream",
+        _setup_with_stream(_setup_with_values(_setup_reduce, op=cc.OpKind.PLUS)),
+        _make_reduce,
+        _oneshot_reduce_stream,
+        _twoshot_reduce_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "exclusive_scan.plus.stream",
+        _setup_with_stream(_setup_with_values(_setup_scan, op=cc.OpKind.PLUS)),
+        _make_scan,
+        _oneshot_scan_stream,
+        _twoshot_scan_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "segmented_reduce.plus.stream",
+        _setup_with_stream(
+            _setup_with_values(_setup_segmented_reduce, op=cc.OpKind.PLUS)
+        ),
+        _make_segmented_reduce,
+        _oneshot_segmented_reduce_stream,
+        _twoshot_segmented_reduce_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "unary_transform.identity.stream",
+        _setup_with_stream(
+            _setup_with_values(_setup_unary_input_output, op=cc.OpKind.IDENTITY)
+        ),
+        _make_unary_transform,
+        _oneshot_unary_transform_stream,
+        _twoshot_unary_transform_stream,
+        "none",
+    ),
+    _make_case(
+        "binary_transform.plus.stream",
+        _setup_with_stream(
+            _setup_with_values(_setup_binary_input_output, op=cc.OpKind.PLUS)
+        ),
+        _make_binary_transform,
+        _oneshot_binary_transform_stream,
+        _twoshot_binary_transform_stream,
+        "none",
+    ),
+    _make_case(
+        "histogram_even.stream",
+        _setup_with_stream(_setup_histogram),
+        _make_histogram,
+        _oneshot_histogram_stream,
+        _twoshot_histogram_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "lower_bound.less.stream",
+        _setup_with_stream(_setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS)),
+        _make_lower_bound,
+        _oneshot_lower_bound_stream,
+        _twoshot_lower_bound_stream,
+        "none",
+    ),
+    _make_case(
+        "select.logical_not.stream",
+        _setup_with_stream(_setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT)),
+        _make_select,
+        _oneshot_select_stream,
+        _twoshot_select_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "three_way_partition.logical_not.stream",
+        _setup_with_stream(
+            _setup_with_values(
+                _setup_three_way_partition,
+                first_op=cc.OpKind.LOGICAL_NOT,
+                second_op=cc.OpKind.LOGICAL_NOT,
+            )
+        ),
+        _make_three_way_partition,
+        _oneshot_three_way_partition_stream,
+        _twoshot_three_way_partition_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "unique_by_key.equal.stream",
+        _setup_with_stream(
+            _setup_with_values(_setup_unique_by_key, op=cc.OpKind.EQUAL_TO)
+        ),
+        _make_unique_by_key,
+        _oneshot_unique_by_key_stream,
+        _twoshot_unique_by_key_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "merge_sort.less.stream",
+        _setup_with_stream(_setup_with_values(_setup_sort, op=cc.OpKind.LESS)),
+        _make_merge_sort,
+        _oneshot_merge_sort_stream,
+        _twoshot_merge_sort_stream,
+        "temp_storage_bytes",
+    ),
+    _make_case(
+        "radix_sort.stream",
+        _setup_with_stream(_setup_sort),
+        _make_radix_sort,
+        _oneshot_radix_sort_stream,
+        _twoshot_radix_sort_stream,
+        "temp_storage_and_selector",
+    ),
+    _make_case(
+        "segmented_sort.stream",
+        _setup_with_stream(_setup_segmented_sort),
+        _make_segmented_sort,
+        _oneshot_segmented_sort_stream,
+        _twoshot_segmented_sort_stream,
+        "temp_storage_and_selector",
+    ),
+]
+
+CALL_CASES = CASES + STREAM_CASES
diff --git a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
index 4e62cdd3a6d..1fb6b8953cf 100644
--- a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
+++ b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py
@@ -6,6 +6,7 @@
 
 import pytest
 from host_benchmark_cases import (
+    CALL_CASES,
     CASES,
     HostBenchmarkCase,
     patch_wrapper_to_skip_native_compute,
@@ -23,9 +24,9 @@
 TWOSHOT_ITERATIONS = 1000
 
 
-def _case_params() -> list[pytest.ParameterSet]:
+def _case_params(cases: list[HostBenchmarkCase]) -> list[pytest.ParameterSet]:
     params = []
-    for case in CASES:
+    for case in cases:
         marks = []
         if case.skip_reason is not None:
             marks.append(pytest.mark.skip(reason=case.skip_reason))
@@ -34,7 +35,7 @@ def _case_params() -> list[pytest.ParameterSet]:
 
 
 @pytest.mark.benchmark(group="cuda.compute.host.build_time")
-@pytest.mark.parametrize("case", _case_params())
+@pytest.mark.parametrize("case", _case_params(CASES))
 def test_build_time(benchmark, case: HostBenchmarkCase):
     state = case.setup()
     synchronize()
@@ -55,7 +56,7 @@ def build():
 
 
 @pytest.mark.benchmark(group="cuda.compute.host.oneshot_cached")
-@pytest.mark.parametrize("case", _case_params())
+@pytest.mark.parametrize("case", _case_params(CALL_CASES))
 def test_oneshot_cached_host_overhead(benchmark, case: HostBenchmarkCase):
     cc.clear_all_caches()
     state = case.setup()
@@ -75,7 +76,7 @@ def call() -> None:
 
 
 @pytest.mark.benchmark(group="cuda.compute.host.twoshot_call")
-@pytest.mark.parametrize("case", _case_params())
+@pytest.mark.parametrize("case", _case_params(CALL_CASES))
 def test_twoshot_call_host_overhead(benchmark, case: HostBenchmarkCase):
     cc.clear_all_caches()
     state = case.setup()

From 8172fd6a09ae068632a7d7cc5782a26bbf2b6bc0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:53:27 +0000
Subject: [PATCH 09/16] [pre-commit.ci] auto code formatting

---
 .../benchmarks/compute/host/host_benchmark_cases.py    | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
index 9550650567f..ceaa51709d6 100644
--- a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
+++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py
@@ -12,9 +12,9 @@
 import numpy as np
 
 import cuda.compute as cc
-from cuda.core import Device
 from cuda.compute._cpp_compile import compile_cpp_op_code
 from cuda.compute.op import RawOp
+from cuda.core import Device
 
 NOOP_TEMP_STORAGE_BYTES = 1
 NUM_ITEMS = 128
@@ -1462,7 +1462,9 @@ def _make_case(
     ),
     _make_case(
         "lower_bound.less.stream",
-        _setup_with_stream(_setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS)),
+        _setup_with_stream(
+            _setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS)
+        ),
         _make_lower_bound,
         _oneshot_lower_bound_stream,
         _twoshot_lower_bound_stream,
@@ -1470,7 +1472,9 @@ def _make_case(
     ),
     _make_case(
         "select.logical_not.stream",
-        _setup_with_stream(_setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT)),
+        _setup_with_stream(
+            _setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT)
+        ),
         _make_select,
         _oneshot_select_stream,
         _twoshot_select_stream,

From c564d3e13e9931f25052099ec0f461ccee824e08 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 17:04:23 -0500
Subject: [PATCH 10/16] Address comments

---
 ci/test_cuda_compute_minimal_python.sh                     | 4 ++--
 python/cuda_cccl/tests/compute/test_binary_search.py       | 1 +
 python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
index c48b313ed31..672338bbf33 100755
--- a/ci/test_cuda_compute_minimal_python.sh
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -19,8 +19,8 @@ setup_python_env "${py_version}"
 # Fetch or build the cuda_cccl wheel:
 if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
   wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
-  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/
-  wheelhouse_dir="/home/coder/cccl/wheelhouse"
+  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/"
+  wheelhouse_dir="${repo_root}/wheelhouse"
 else
   "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
   wheelhouse_dir="${repo_root}/wheelhouse"
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 585a998c44c..1e24ca00116 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -47,6 +47,7 @@ def disable_sass_check(monkeypatch):
         (cuda.compute.upper_bound, "right"),
     ],
 )
+@pytest.mark.no_numba
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 9f2683f6798..b9813558de3 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -71,7 +71,7 @@ def _raw_less_than_i32(name: str, threshold: int) -> RawOp:
     source = f"""
 extern "C" __device__ void {name}(void* x, void* result) {{
     int value = *static_cast<int*>(x);
-    *static_cast<bool*>(result) = value < {threshold};
+    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
 }}
 """
     return RawOp(ltoir=compile_cpp_op_code(source), name=name)

From 79df8e0a62cce4a6b943a369406da3987c989aa8 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 17:11:35 -0500
Subject: [PATCH 11/16] fix merge conflict

---
 python/cuda_cccl/tests/compute/test_binary_search.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 9b07eeb8815..1e24ca00116 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -47,10 +47,7 @@ def disable_sass_check(monkeypatch):
         (cuda.compute.upper_bound, "right"),
     ],
 )
-<<<<<<< HEAD
 @pytest.mark.no_numba
-=======
->>>>>>> main
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)

From c2e7c01b5732f968520ef25c21503f244fb02a6b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:16:44 +0000
Subject: [PATCH 12/16] [pre-commit.ci] auto code formatting

---
 python/cuda_cccl/tests/compute/test_radix_sort.py          | 2 +-
 python/cuda_cccl/tests/compute/test_scan.py                | 2 +-
 python/cuda_cccl/tests/compute/test_select.py              | 2 +-
 python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +-
 python/cuda_cccl/tests/compute/test_unique_by_key.py       | 2 +-
 python/cuda_cccl/tests/compute/test_zip_iterator.py        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index 67186d3a4f7..d39a7e07fad 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -10,11 +10,11 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     DoubleBuffer,
     SortOrder,
 )
+from cuda.core import Device
 
 
 def get_mark(dt, log_size):
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index 97f5fae630a..0eb3af34c49 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -8,7 +8,6 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     OpKind,
@@ -16,6 +15,7 @@
     TransformOutputIterator,
     gpu_struct,
 )
+from cuda.core import Device
 
 
 def scan_host(h_input: np.ndarray, op, h_init, force_inclusive):
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index d467d33c635..3dcaf98a507 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -7,9 +7,9 @@
 import pytest
 
 import cuda.compute
+from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
 from cuda.compute._cpp_compile import compile_cpp_op_code
 from cuda.compute.op import RawOp
-from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
 
 DTYPE_LIST = [
     np.uint8,
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index b9813558de3..8def513681b 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -7,9 +7,9 @@
 import pytest
 
 import cuda.compute
+from cuda.compute import CacheModifiedInputIterator, gpu_struct
 from cuda.compute._cpp_compile import compile_cpp_op_code
 from cuda.compute.op import RawOp
-from cuda.compute import CacheModifiedInputIterator, gpu_struct
 
 DTYPE_LIST = [
     np.uint8,
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 5f7d310d71e..945d4d99028 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -8,13 +8,13 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     CacheModifiedInputIterator,
     DiscardIterator,
     OpKind,
     gpu_struct,
 )
+from cuda.core import Device
 
 DTYPE_LIST = [
     np.uint8,
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index aea1f167f9f..9fbed38359f 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -6,13 +6,13 @@
 import pytest
 
 import cuda.compute
-from cuda.core import Device
 from cuda.compute import (
     CountingIterator,
     TransformIterator,
     ZipIterator,
     gpu_struct,
 )
+from cuda.core import Device
 
 
 @pytest.mark.parametrize("num_items", [10, 1_000, 100_000])

From b09c9c6ecb2df85d4e4fed2279fcfd7efe255254 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Fri, 12 Jun 2026 17:26:21 -0500
Subject: [PATCH 13/16] Fix pre-commit

---
 .../tests/compute/test_void_ptr_wrapper_validation.py     | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
index e4d8185eb91..2788a3c0893 100644
--- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
+++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
@@ -16,8 +16,12 @@
 numba = pytest.importorskip("numba")
 types = numba.types
 
-from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper
-from cuda.compute._utils import sanitize_identifier
+from cuda.compute._odr_helpers import (  # noqa: E402
+    _ArgMode,
+    _ArgSpec,
+    _create_void_ptr_wrapper,
+)
+from cuda.compute._utils import sanitize_identifier  # noqa: E402
 
 
 def _make_arg_specs():

From 4e5b9ccb0d7b3bda7dbf3d55b71d3d0ff7879bba Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Mon, 15 Jun 2026 12:10:01 -0500
Subject: [PATCH 14/16] Document new caching behavior and add examples

---
 docs/python/compute/index.rst                 | 41 ++++++++++--
 .../examples/free_threading/__init__.py       |  3 +
 .../examples/free_threading/direct_api.py     | 46 ++++++++++++++
 .../examples/free_threading/object_api.py     | 63 +++++++++++++++++++
 4 files changed, 149 insertions(+), 4 deletions(-)
 create mode 100644 python/cuda_cccl/tests/compute/examples/free_threading/__init__.py
 create mode 100644 python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py
 create mode 100644 python/cuda_cccl/tests/compute/examples/free_threading/object_api.py

diff --git a/docs/python/compute/index.rst b/docs/python/compute/index.rst
index ba8519a8352..f193cbe95da 100644
--- a/docs/python/compute/index.rst
+++ b/docs/python/compute/index.rst
@@ -250,10 +250,14 @@ When working with structured data, there are two common memory layouts:
 Caching
 -------
 
-Algorithms in ``cuda.compute`` are compiled to GPU code at runtime. To avoid
-recompiling on every call, build results are cached in memory. When you invoke
-an algorithm with the same configuration—same dtypes, iterator kinds, operator,
-and compute capability—the cached build is reused.
+Algorithms in ``cuda.compute`` are compiled to GPU code at runtime. To
+avoid recompiling on every call, build results are cached in memory.
+When you invoke an algorithm with the same configuration—same dtypes,
+iterator kinds, operator, compute capability, and current device—the
+cached build is reused. On systems with multiple GPUs, builds may be
+cached separately for each GPU. On a free-threaded Python interpreter,
+compiled build results may be reused by multiple threads in the same
+process.
 
 What determines the cache key
 +++++++++++++++++++++++++++++
@@ -265,12 +269,41 @@ Each algorithm computes a cache key from:
 * **Operator identity** — for user-defined functions, the function's bytecode,
   constants, and closure contents (see below)
 * **Compute capability** — the GPU architecture of the current device
+* **Current device** — the CUDA device active when the algorithm is built
 * **Algorithm-specific parameters** — such as initial value dtype or determinism mode
 
 Note that array *contents* or *pointers* are not part of the cache key—only
 the array's dtype. This means you can reuse a cached algorithm across different
 arrays of the same type.
 
+Multi-GPU behavior
+++++++++++++++++++
+
+Cached builds are device-specific. If the same algorithm configuration is used
+on multiple GPUs, ``cuda.compute`` may compile and cache a separate build for
+each device. Set the intended current CUDA device before constructing or invoking
+an algorithm, and pass arrays that are valid on that device.
+
+Free-threaded Python
+++++++++++++++++++++
+
+When ``cuda.compute`` is built for a free-threaded Python interpreter,
+independent calls from multiple Python threads can reuse compiled build results
+within the same process.
+
+The cache is local to the current Python process. Separate Python processes build
+and cache independently, even if they use the same GPU and algorithm
+configuration.
+
+This does not make user-provided memory or CUDA work automatically safe to share.
+Users are still responsible for avoiding data races, such as two threads writing
+to the same output array at the same time. For concurrent use, prefer the direct
+algorithm APIs, such as
+:func:`reduce_into <cuda.compute.algorithms.reduce_into>`, or create a separate
+reusable algorithm object in each thread (for example, the object returned by
+:func:`make_reduce_into <cuda.compute.algorithms.make_reduce_into>`). If multiple
+threads share one of these objects, serialize access to that object.
+
 How user-defined functions are cached
 +++++++++++++++++++++++++++++++++++++
 
diff --git a/python/cuda_cccl/tests/compute/examples/free_threading/__init__.py b/python/cuda_cccl/tests/compute/examples/free_threading/__init__.py
new file mode 100644
index 00000000000..8bbe3ce1ab8
--- /dev/null
+++ b/python/cuda_cccl/tests/compute/examples/free_threading/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py b/python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py
new file mode 100644
index 00000000000..02cbe05a6f6
--- /dev/null
+++ b/python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# example-begin
+"""
+Run independent direct API calls from multiple Python threads.
+"""
+
+from concurrent.futures import ThreadPoolExecutor
+
+import cupy as cp
+import numpy as np
+
+import cuda.compute
+from cuda.compute import OpKind
+
+
+def reduce_values(h_input):
+    dtype = np.int32
+    h_init = np.array([0], dtype=dtype)
+    d_input = cp.asarray(h_input, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
+
+    cuda.compute.reduce_into(
+        d_in=d_input,
+        d_out=d_output,
+        num_items=len(h_input),
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    return int(d_output.get()[0])
+
+
+inputs = [
+    np.array([1, 2, 3, 4], dtype=np.int32),
+    np.array([5, 6, 7, 8], dtype=np.int32),
+]
+
+with ThreadPoolExecutor(max_workers=len(inputs)) as executor:
+    results = list(executor.map(reduce_values, inputs))
+
+expected = [int(np.sum(h_input)) for h_input in inputs]
+assert results == expected
+print(f"Free-threaded direct API results: {results}")
diff --git a/python/cuda_cccl/tests/compute/examples/free_threading/object_api.py b/python/cuda_cccl/tests/compute/examples/free_threading/object_api.py
new file mode 100644
index 00000000000..2b7ec1de3ba
--- /dev/null
+++ b/python/cuda_cccl/tests/compute/examples/free_threading/object_api.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# example-begin
+"""
+Run independent object-based API calls from multiple Python threads.
+"""
+
+from concurrent.futures import ThreadPoolExecutor
+
+import cupy as cp
+import numpy as np
+
+import cuda.compute
+from cuda.compute import OpKind
+
+
+def reduce_values(h_input):
+    dtype = np.int32
+    h_init = np.array([0], dtype=dtype)
+    d_input = cp.asarray(h_input, dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
+
+    reducer = cuda.compute.make_reduce_into(
+        d_in=d_input,
+        d_out=d_output,
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+    temp_storage_size = reducer(
+        temp_storage=None,
+        d_in=d_input,
+        d_out=d_output,
+        num_items=len(h_input),
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
+
+    reducer(
+        temp_storage=d_temp_storage,
+        d_in=d_input,
+        d_out=d_output,
+        num_items=len(h_input),
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    return int(d_output.get()[0])
+
+
+inputs = [
+    np.array([1, 2, 3, 4], dtype=np.int32),
+    np.array([5, 6, 7, 8], dtype=np.int32),
+]
+
+with ThreadPoolExecutor(max_workers=len(inputs)) as executor:
+    results = list(executor.map(reduce_values, inputs))
+
+expected = [int(np.sum(h_input)) for h_input in inputs]
+assert results == expected
+print(f"Free-threaded object API results: {results}")

From cd0858e6066545cdda8fb3a97175d564ca3fcad9 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Mon, 15 Jun 2026 14:41:24 -0500
Subject: [PATCH 15/16] Update developer documentation

---
 docs/python/compute/developer_overview.rst | 170 +++++++++++++++++++++
 docs/python/compute/index.rst              |   5 +-
 2 files changed, 174 insertions(+), 1 deletion(-)

diff --git a/docs/python/compute/developer_overview.rst b/docs/python/compute/developer_overview.rst
index c6d30971b01..6cec0b38e5f 100644
--- a/docs/python/compute/developer_overview.rst
+++ b/docs/python/compute/developer_overview.rst
@@ -451,6 +451,176 @@ as an example:
    At this point, the kernels stored in the reduction object are
    launched and the reduction is performed.
 
+Caching and free-threaded Python
+--------------------------------
+
+The user-facing cache behavior is described in :ref:`cuda.compute.caching`. This
+section describes the implementation contracts that keep that behavior correct
+for free-threaded Python and multi-GPU use.
+
+Design requirements
++++++++++++++++++++
+
+The free-threading design is constrained by the following requirements:
+
+* Importing ``cuda.compute`` in a free-threaded CPython interpreter must not
+  re-enable the GIL.
+* Free-threading support should not add global locking or shared-state
+  contention to the normal single-threaded execution path. Wrapper cache hits
+  should be thread-local, and normal algorithm execution should not take a
+  global cache lock.
+* Mutable wrapper state must not be shared across threads.
+* Expensive native build results should still be shared across threads when they
+  are safe to share.
+* Same-key concurrent cold builds should build once; waiters should receive the
+  same result or observe the same exception.
+
+Build and validation requirements
++++++++++++++++++++++++++++++++++
+
+The Cython extension that backs ``cuda.compute`` must opt in to free-threaded
+execution:
+
+.. code-block:: cython
+
+   # cython: freethreading_compatible=True
+
+Without this marker, importing the extension in a free-threaded CPython process
+can cause CPython to re-enable the GIL. The generated extension should advertise
+``Py_MOD_GIL_NOT_USED``, and ``sys._is_gil_enabled()`` should still return
+``False`` after ``cuda.compute`` is imported.
+
+The free-threaded wheel must also keep its free-threaded ABI tag after repair and
+merge steps. For CPython 3.14, the expected wheel tag contains
+``cp314-cp314t`` rather than the regular ``cp314-cp314`` tag. The acceptance
+criteria for a free-threaded build are:
+
+* the wheel has the expected ``cp314-cp314t`` ABI tag;
+* importing ``cuda.compute`` does not re-enable the GIL;
+* the free-threading stress suite passes without forcing ``PYTHON_GIL=0`` or
+  ``-X gil=0``.
+
+Two cache layers
+++++++++++++++++
+
+Internally, ``cuda.compute`` separates two kinds of cached state:
+
+* **Wrapper objects** are the Python objects returned by ``make_*`` APIs, such as
+  ``make_reduce_into``. They own per-call descriptor state and are cached per
+  Python thread by ``cache_with_registered_key_functions`` in
+  ``cuda/compute/_caching.py``. Keeping wrapper caches thread-local avoids
+  sharing mutable wrapper state across concurrent calls from free-threaded
+  Python.
+* **Build results** are the Cython objects that own the native C parallel build
+  state, such as loaded CUDA libraries, kernels, policy state, and other
+  read-only data needed to invoke an algorithm. They are cached by
+  ``cache_build_result`` and may be shared by wrapper objects in different
+  Python threads.
+
+The normal cache-hit path is intentionally cheap. A wrapper-cache hit is
+thread-local and does not take the shared build-cache lock. The shared
+build-cache lock is used when constructing a wrapper that needs to look up,
+coordinate, or create a native build result, not during ordinary execution of an
+already-returned wrapper object.
+
+Device keying
++++++++++++++
+
+Both cache layers include the current CUDA runtime device ordinal and compute
+capability in their keys. The compute capability identifies the architecture used
+for code generation and policy selection. The device ordinal keeps native build
+state associated with the device on which it was built.
+
+The first implementation intentionally keys shared build results by CUDA runtime
+device ordinal rather than by CUDA context handle. User-managed CUDA driver
+contexts are not a target use case for ``cuda.compute``. CUDA runtime,
+``cuda.core``, CuPy, and PyTorch-style applications are expected to use the
+primary-context model, and language frontends generally prefer that model.
+
+The first implementation also does not share build results across devices that
+happen to have the same compute capability. Native build results are not treated
+as pure SM-level code artifacts. They can contain CUDA-facing build/load state,
+and CUB launch paths may resolve a ``CUkernel`` to the current-context
+``CUfunction`` before occupancy queries or launch. Some paths also get or set
+kernel attributes on the resolved function, and CUDA kernel-attribute behavior
+is device-specific. Until every build-result path is audited for same-SM
+cross-device sharing, separate device ordinals build and cache separate native
+results.
+
+Concurrent build coordination
++++++++++++++++++++++++++++++
+
+``cache_build_result`` is responsible for coordinating concurrent cache misses.
+The first thread to miss a build-result key runs the builder, while other
+threads wait for that in-flight build to complete. If the build succeeds, all
+waiting threads receive the same cached build result. If it fails, the exception
+is propagated to the waiting threads and the failed build is not stored in the
+cache.
+
+When adding a new algorithm, the factory that returns the reusable wrapper object
+should use ``cache_with_registered_key_functions``. The wrapper constructor
+should pass the expensive native build operation to ``cache_build_result`` if
+that native state is safe to share across threads. Do not perform an expensive
+native build before entering ``cache_build_result``; otherwise same-key cold
+factory calls can duplicate the build and bypass single-flight coordination.
+
+The specialization key must include every argument that can affect generated
+code, type layout, policy selection, or native build state. It should not include
+runtime-only values such as array pointers, array contents, item counts, streams,
+or temporary-storage pointers unless those values change the compiled interface.
+
+User-object and descriptor contracts
+++++++++++++++++++++++++++++++++++++
+
+Wrapper objects returned by ``make_*`` APIs are not thread-safe. If two
+threads need the same algorithm specialization, each thread should call the
+factory and receive its own wrapper object, or the caller must externally
+serialize access to a shared wrapper. The wrapper updates its Cython
+``Iterator``, ``Op``, ``Value``, and algorithm-specific descriptors before each
+native call, so concurrent calls through the same wrapper could overwrite the
+descriptor state another thread is about to use.
+
+Read-only iterator and operator objects may be shared across threads. The
+iterator base class uses a per-iterator lock for first-time lazy construction of
+advance, input-dereference, and output-dereference ``Op`` objects; cached access
+after that remains lock-free. This lock does not make arbitrary mutation safe:
+concurrent mutation of iterator state, operator state, captured state, or child
+iterators remains unsupported unless the caller synchronizes externally.
+
+Mutable execution state belongs to one thread at a time unless the caller
+provides synchronization. This includes output arrays, temporary-storage buffers,
+streams, ``DoubleBuffer`` instances, and other objects whose state changes as
+part of a launch.
+
+Backend-specific notes
+++++++++++++++++++++++
+
+The v1 NVRTC/nvJitLink backend and the v2 HostJIT backend have different
+free-threading risk surfaces and must be audited independently. v1 stresses
+NVRTC, nvJitLink, CUDA library loading, and CUB host dispatch. v2 adds HostJIT
+compiler state, LLVM/Clang initialization, persistent PCH paths, generated
+source/cubin artifacts, and dynamic loader lifetime.
+
+Transform has one additional v1 native-cache rule. In CPython 3.14
+free-threaded builds, ``python/cuda_cccl/CMakeLists.txt`` defines
+``CCCL_PYTHON_FREE_THREADED`` for the bundled C parallel target, and
+``c/parallel/src/transform.cu`` uses that macro to bypass the native
+``async_config`` / ``prefetch_config`` cache, whose lazy fill is unsynchronized.
+Non-free-threaded builds keep the existing lazy native cache path. This avoids
+adding launch-path locking for transform in free-threaded Python builds while
+preserving the existing single-threaded behavior elsewhere.
+
+Clearing caches
++++++++++++++++
+
+``clear_all_caches()`` is process-local. It clears all known per-thread wrapper
+caches through a weak registry of live thread cache containers, and it clears the
+shared build-result cache. Separate Python processes build and cache
+independently.
+
+Calling ``clear_all_caches()`` concurrently with active factory calls or
+algorithm execution is not supported unless the caller synchronizes externally.
+
 
 For readers who want to connect this overview back to the source tree:
 
diff --git a/docs/python/compute/index.rst b/docs/python/compute/index.rst
index f193cbe95da..e5f82bb8386 100644
--- a/docs/python/compute/index.rst
+++ b/docs/python/compute/index.rst
@@ -297,7 +297,10 @@ configuration.
 
 This does not make user-provided memory or CUDA work automatically safe to share.
 Users are still responsible for avoiding data races, such as two threads writing
-to the same output array at the same time. For concurrent use, prefer the direct
+to the same output array at the same time. Read-only iterator and operator
+objects may be shared across threads, but concurrent mutation of those objects,
+captured state, or underlying arrays requires external synchronization. For
+concurrent use, prefer the direct
 algorithm APIs, such as
 :func:`reduce_into <cuda.compute.algorithms.reduce_into>`, or create a separate
 reusable algorithm object in each thread (for example, the object returned by

From 15652c00259b5650977a4fb33d66d4bd680559b6 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Tue, 23 Jun 2026 09:34:33 -0500
Subject: [PATCH 16/16] Move no numba tests to separate file

---
 ci/test_cuda_compute_minimal_python.sh        |  21 +-
 .../tests/compute/test_binary_search.py       |   1 -
 .../cuda_cccl/tests/compute/test_bindings.py  |   2 -
 .../compute/test_deferred_annotations.py      |   2 -
 .../tests/compute/test_func_caching.py        |   9 -
 .../cuda_cccl/tests/compute/test_histogram.py |   2 -
 .../cuda_cccl/tests/compute/test_iterators.py |  10 +-
 .../tests/compute/test_merge_sort.py          |  55 +-
 .../tests/compute/test_nested_struct.py       |   5 -
 .../cuda_cccl/tests/compute/test_no_numba.py  | 560 +++++++++++++++++-
 .../compute/test_permutation_iterator.py      |   6 -
 .../tests/compute/test_radix_sort.py          |  76 ++-
 python/cuda_cccl/tests/compute/test_reduce.py |  38 +-
 python/cuda_cccl/tests/compute/test_scan.py   |  16 +-
 .../tests/compute/test_segmented_reduce.py    |   3 -
 .../tests/compute/test_segmented_sort.py      |  51 +-
 python/cuda_cccl/tests/compute/test_select.py |  31 -
 .../tests/compute/test_shuffle_iterator.py    |   1 -
 .../compute/test_struct_field_validation.py   |   2 -
 .../tests/compute/test_three_way_partition.py |  46 --
 .../cuda_cccl/tests/compute/test_transform.py |   5 -
 .../tests/compute/test_unique_by_key.py       |  69 ++-
 .../test_void_ptr_wrapper_validation.py       |  12 +-
 .../tests/compute/test_zip_iterator.py        |  11 +-
 24 files changed, 716 insertions(+), 318 deletions(-)

diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
index 672338bbf33..a43bbac7bfa 100755
--- a/ci/test_cuda_compute_minimal_python.sh
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -27,25 +27,10 @@ else
 fi
 
 # Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
-# full cu* extras because those pull in numba/numba-cuda. In a clean minimal
-# environment, the test phase below runs only tests marked no_numba.
+# full cu* extras because those pull in numba/numba-cuda.
 CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
 python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
 python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
 
-if python - <<'PY'
-try:
-    import numba.cuda  # noqa: F401
-except Exception as exc:
-    print(f"numba.cuda unavailable; running no_numba subset: {exc!r}")
-    raise SystemExit(1)
-else:
-    print("numba.cuda available; running full compute test suite.")
-PY
-then
-  cd "${repo_root}/python/cuda_cccl/tests/"
-  python -m pytest -n 6 -v compute/ -m "not large"
-else
-  cd "${repo_root}/python/cuda_cccl/tests/"
-  python -m pytest -n 6 -v compute/ -m "not large and no_numba"
-fi
+cd "${repo_root}/python/cuda_cccl/tests/"
+python -m pytest -n 6 -v compute/test_no_numba.py
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 1e24ca00116..585a998c44c 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -47,7 +47,6 @@ def disable_sass_check(monkeypatch):
         (cuda.compute.upper_bound, "right"),
     ],
 )
-@pytest.mark.no_numba
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)
diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py
index ccfbd8d0bc3..2f16da620ef 100644
--- a/python/cuda_cccl/tests/compute/test_bindings.py
+++ b/python/cuda_cccl/tests/compute/test_bindings.py
@@ -4,8 +4,6 @@
 
 import cuda.compute._bindings as bindings
 
-pytestmark = pytest.mark.no_numba
-
 
 @pytest.fixture(
     params=[
diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
index a584b8ffbe2..c30aeda4068 100644
--- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py
+++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
@@ -7,12 +7,10 @@
 
 import cupy as cp
 import numpy as np
-import pytest
 
 from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into
 
 
-@pytest.mark.no_numba
 def test_deferred_annotations():
     # test that we can use @gpu_struct with deferred annotations
     # GH: #6421
diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py
index e94e0069529..8fcf47b32a7 100644
--- a/python/cuda_cccl/tests/compute/test_func_caching.py
+++ b/python/cuda_cccl/tests/compute/test_func_caching.py
@@ -1,12 +1,10 @@
 import numpy as np
-import pytest
 
 from cuda.compute._caching import CachableFunction
 
 global_x = 1
 
 
-@pytest.mark.no_numba
 def test_func_caching_basic():
     def func(x):
         return x
@@ -21,7 +19,6 @@ def func(x):
     assert f1 == f2
 
 
-@pytest.mark.no_numba
 def test_func_caching_different_names():
     def func(x):
         return x
@@ -36,7 +33,6 @@ def func2(x):
     assert f1 != f2
 
 
-@pytest.mark.no_numba
 def test_func_caching_different_code():
     def func(x):
         return x
@@ -50,7 +46,6 @@ def func(x):
     assert f1 != f2
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_closure():
     def factory(x):
         def func(y):
@@ -66,7 +61,6 @@ def func(y):
     assert f1 != f3
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_numpy_numeric_scalar_closure():
     def factory(indexlength, regularsize):
         index_dtype = np.int64
@@ -86,7 +80,6 @@ def func(counter):
     assert f1 != f3
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_global_variable():
     global global_x
 
@@ -133,7 +126,6 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func3)
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_global_np_ufunc():
     def make_func():
         def func(x):
@@ -153,7 +145,6 @@ def func(x):
     assert CachableFunction(func1) != CachableFunction(func2)
 
 
-@pytest.mark.no_numba
 def test_func_caching_with_aliased_np_ufunc():
     def make_func1():
         amin = np.argmin
diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py
index 5c104114754..0a9d96674e6 100644
--- a/python/cuda_cccl/tests/compute/test_histogram.py
+++ b/python/cuda_cccl/tests/compute/test_histogram.py
@@ -12,8 +12,6 @@
     CountingIterator,
 )
 
-pytestmark = pytest.mark.no_numba
-
 DTYPE_LIST = [
     np.uint8,
     np.uint16,
diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py
index c3b6acc9fc9..49e2ab39d15 100644
--- a/python/cuda_cccl/tests/compute/test_iterators.py
+++ b/python/cuda_cccl/tests/compute/test_iterators.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -21,7 +22,6 @@
 )
 
 
-@pytest.mark.no_numba
 def test_constant_iterator_equality():
     it1 = ConstantIterator(np.int32(0))
     it2 = ConstantIterator(np.int32(0))
@@ -32,7 +32,6 @@ def test_constant_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 def test_counting_iterator_equality():
     it1 = CountingIterator(np.int32(0))
     it2 = CountingIterator(np.int32(0))
@@ -43,7 +42,6 @@ def test_counting_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 def test_cache_modified_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -124,14 +122,11 @@ def reverse_iterator_array(request):
     if array_type == "cupy":
         array = cp.array(base_array)
     else:
-        import numba.cuda
-
         array = numba.cuda.to_device(base_array)
 
     return array
 
 
-@pytest.mark.no_numba
 def test_reverse_input_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -146,7 +141,6 @@ def test_reverse_input_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 def test_reverse_output_iterator_equality():
     ary1 = cp.asarray([0, 1, 2], dtype="int32")
     ary2 = cp.asarray([3, 4, 5], dtype="int32")
@@ -161,7 +155,6 @@ def test_reverse_output_iterator_equality():
     assert it1.kind != it4.kind
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, itemsize, expected",
     [
@@ -186,7 +179,6 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected):
     assert result == expected
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "shape, dtype",
     [
diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py
index db97d5aca01..33d7d15de58 100644
--- a/python/cuda_cccl/tests/compute/test_merge_sort.py
+++ b/python/cuda_cccl/tests/compute/test_merge_sort.py
@@ -5,6 +5,7 @@
 from typing import List
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -81,11 +82,11 @@ def compare_op(lhs, rhs):
 def test_merge_sort_keys(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items)
 
-    h_out_keys = d_in_keys.get()
+    h_out_keys = d_in_keys.copy_to_host()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -101,13 +102,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch):
     h_in_keys = random_array(num_items, dtype)
     h_in_items = random_array(num_items, np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items)
 
-    h_out_keys = d_in_keys.get()
-    h_out_items = d_in_items.get()
+    h_out_keys = d_in_keys.copy_to_host()
+    h_out_items = d_in_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -122,12 +123,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items)
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
     h_in_keys.sort()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -145,15 +146,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items)
 
-    h_out_keys = d_out_keys.get()
-    h_out_items = d_out_items.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_items = d_out_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -224,11 +225,11 @@ def compare_complex(lhs, rhs):
     imaginary = random_array(num_items, np.int64, max_value)
 
     h_in_keys = real + 1j * imaginary
-    d_in_keys = cp.asarray(h_in_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items)
 
-    h_out_keys = d_in_keys.get()
+    h_out_keys = d_in_keys.copy_to_host()
     h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)]
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
@@ -239,15 +240,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream")
 
     merge_sort_device(i_input, None, d_out_keys, None, op, num_items)
 
     h_in_keys.sort()
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
 
     np.testing.assert_array_equal(h_out_keys, h_in_keys)
 
@@ -264,10 +265,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
 
     i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -276,8 +277,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
         i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_items = d_out_items.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_items = d_out_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -306,7 +307,6 @@ def test_merge_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
-@pytest.mark.no_numba
 def test_merge_sort_well_known_less():
     dtype = np.int32
 
@@ -326,7 +326,6 @@ def test_merge_sort_well_known_less():
     np.testing.assert_equal(d_out_keys.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_merge_sort_well_known_greater():
     dtype = np.int32
 
diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py
index 715873af3c8..9cbfa40e393 100644
--- a/python/cuda_cccl/tests/compute/test_nested_struct.py
+++ b/python/cuda_cccl/tests/compute/test_nested_struct.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
-import pytest
 
 import cuda.compute
 from cuda.compute import ZipIterator, gpu_struct
@@ -150,7 +149,6 @@ def sum_pixels(p1, p2):
     assert result["color"]["b"] == expected_b
 
 
-@pytest.mark.no_numba
 def test_dict_init_nested_struct():
     """Test initializing a nested struct with a dictionary."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -164,7 +162,6 @@ def test_dict_init_nested_struct():
     assert np.isclose(obj.inner.b, 3.14)
 
 
-@pytest.mark.no_numba
 def test_dict_init_per_field():
     """Test initializing a struct with a dictionary for a nested field."""
     Inner = gpu_struct({"a": np.int32, "b": np.float32})
@@ -178,7 +175,6 @@ def test_dict_init_per_field():
     assert np.isclose(obj.inner.b, 3.14)
 
 
-@pytest.mark.no_numba
 def test_dict_init_deeply_nested():
     """Test initializing deeply nested structs (3+ levels) with dictionaries."""
     Level1 = gpu_struct({"value": np.int32})
@@ -193,7 +189,6 @@ def test_dict_init_deeply_nested():
     assert obj.middle.nested.value == 42
 
 
-@pytest.mark.no_numba
 def test_dict_init_mixed():
     """Test mixed initialization with some dicts and some direct values."""
     Inner1 = gpu_struct({"a": np.int32, "b": np.int32})
diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py
index bbab965c8bf..6fb1ef0e811 100644
--- a/python/cuda_cccl/tests/compute/test_no_numba.py
+++ b/python/cuda_cccl/tests/compute/test_no_numba.py
@@ -1,12 +1,568 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import cupy as cp
+import numpy as np
 import pytest
 
-# Check that tests marked no_numba fail fast if they import numba.
+import cuda.compute
+from cuda.compute import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    DiscardIterator,
+    OpKind,
+    PermutationIterator,
+    ReverseIterator,
+    ShuffleIterator,
+    SortOrder,
+    TransformIterator,
+    TransformOutputIterator,
+    ZipIterator,
+)
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
+from cuda.compute.types import int16 as cccl_int16
+from cuda.compute.types import int32 as cccl_int32
+
+# These tests define the minimal-extra integration contract. They intentionally
+# use small fixed inputs and avoid the Python-callable operator path.
 pytestmark = pytest.mark.no_numba
 
 
-@pytest.mark.no_numba
+def _raw_op(source: str, name: str) -> RawOp:
+    return RawOp(name=name, ltoir=compile_cpp_op_code(source))
+
+
+def _raw_even_i32_op() -> RawOp:
+    cuda_source = """
+extern "C" __device__ void no_numba_even_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = (value % 2) == 0;
+}
+"""
+    return _raw_op(source=cuda_source, name="no_numba_even_i32")
+
+
+def _raw_less_than_i32_op(name: str, threshold: int) -> RawOp:
+    cuda_source = f"""
+extern "C" __device__ void {name}(void* x, void* result) {{
+    int value = *static_cast<int*>(x);
+    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
+}}
+"""
+    return _raw_op(source=cuda_source, name=name)
+
+
+def _raw_plus_i64_op() -> RawOp:
+    cuda_source = """
+extern "C" __device__ void no_numba_plus_i64(
+    void* lhs,
+    void* rhs,
+    void* result
+) {
+    *static_cast<long long*>(result) =
+        *static_cast<long long*>(lhs) + *static_cast<long long*>(rhs);
+}
+"""
+    return _raw_op(source=cuda_source, name="no_numba_plus_i64")
+
+
+def _raw_square_i32_op() -> RawOp:
+    cuda_source = """
+extern "C" __device__ void no_numba_square_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<int*>(result) = value * value;
+}
+"""
+    return _raw_op(source=cuda_source, name="no_numba_square_i32")
+
+
+def _raw_zip_sum_i32_op() -> RawOp:
+    cuda_source = """
+struct Zip2I32 {
+    int field_0;
+    int field_1;
+};
+
+extern "C" __device__ void no_numba_zip_sum_i32(void* x, void* result) {
+    auto values = static_cast<Zip2I32*>(x);
+    *static_cast<int*>(result) = values->field_0 + values->field_1;
+}
+"""
+    return _raw_op(source=cuda_source, name="no_numba_zip_sum_i32")
+
+
+def _raw_negate_i16_op() -> RawOp:
+    cuda_source = """
+extern "C" __device__ void no_numba_negate_i16(void* x, void* result) {
+    *static_cast<short*>(result) = -*static_cast<short*>(x);
+}
+"""
+    return _raw_op(source=cuda_source, name="no_numba_negate_i16")
+
+
 def test_import_numba_raises():
     with pytest.raises(
         ImportError, match="This test is marked 'no_numba' but attempted to import it"
     ):
         import numba.cuda  # noqa: F401
+
+
+def test_reduce_well_known_plus():
+    """Plus-reduce the int32 values 1..13 starting from an initial value of 5."""
+    host_values = np.arange(1, 14, dtype=np.int32)
+    initial = np.array([5], dtype=np.int32)
+    device_values = cp.asarray(host_values)
+    device_result = cp.empty(1, dtype=np.int32)
+    cuda.compute.reduce_into(
+        d_in=device_values,
+        d_out=device_result,
+        num_items=device_values.size,
+        op=OpKind.PLUS,
+        h_init=initial,
+    )
+
+    assert device_result.get()[0] == np.sum(host_values, initial=initial[0])
+
+
+def test_exclusive_scan_well_known_plus():
+    """Exclusive plus-scan of six uint16 values seeded with 1."""
+    seed = np.array([1], dtype=np.uint16)
+    device_values = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16)
+    device_scanned = cp.empty_like(device_values)
+    cuda.compute.exclusive_scan(
+        d_in=device_values,
+        d_out=device_scanned,
+        op=OpKind.PLUS,
+        init_value=seed,
+        num_items=device_values.size,
+    )
+
+    want = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16)
+    np.testing.assert_array_equal(device_scanned.get(), want)
+
+
+def test_binary_transform_well_known_plus():
+    """Element-wise float32 addition of two device arrays."""
+    left = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32)
+    right = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32)
+    total = cp.empty_like(left)
+    cuda.compute.binary_transform(
+        d_in1=left,
+        d_in2=right,
+        d_out=total,
+        op=OpKind.PLUS,
+        num_items=left.size,
+    )
+
+    np.testing.assert_allclose(total.get(), left.get() + right.get())
+
+
+def test_unary_transform_well_known_negate():
+    """Negate a small int8 device array element by element."""
+    source = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8)
+    negated = cp.empty_like(source)
+    cuda.compute.unary_transform(
+        d_in=source,
+        d_out=negated,
+        op=OpKind.NEGATE,
+        num_items=source.size,
+    )
+
+    np.testing.assert_array_equal(negated.get(), np.asarray([4, 2, 0, -2, -4]))
+
+
+@pytest.mark.parametrize(
+    "search, side",
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_explicit_opkind_less(search, side):
+    """Each bound search must agree with ``np.searchsorted`` for its side."""
+    haystack = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64)
+    needles = np.asarray([0, 3, 4, 10, 12], dtype=np.int64)
+    positions = cp.empty(needles.size, dtype=np.uintp)
+    search(
+        d_data=cp.asarray(haystack),
+        num_items=haystack.size,
+        d_values=cp.asarray(needles),
+        num_values=needles.size,
+        d_out=positions,
+        comp=OpKind.LESS,
+    )
+
+    want = np.searchsorted(haystack, needles, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(positions.get(), want)
+
+
+def test_segmented_reduce_well_known_plus(monkeypatch):
+    """Plus-reduce three contiguous segments with ``_check_sass`` patched to False."""
+    monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
+    zero = np.array([0], dtype=np.uint32)
+    values = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32)
+    segment_begin = cp.asarray([0, 3, 5], dtype=np.int32)
+    segment_end = cp.asarray([3, 5, 8], dtype=np.int32)
+    segment_sums = cp.empty(3, dtype=np.uint32)
+
+    cuda.compute.segmented_reduce(
+        d_in=values,
+        d_out=segment_sums,
+        num_segments=3,
+        start_offsets_in=segment_begin,
+        end_offsets_in=segment_end,
+        op=OpKind.PLUS,
+        h_init=zero,
+    )
+
+    np.testing.assert_array_equal(segment_sums.get(), np.asarray([6, 9, 21]))
+
+
+def test_merge_sort_well_known_less():
+    """Keys-only merge sort of float64 values using ``OpKind.LESS``."""
+    unsorted_keys = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64)
+    sorted_keys = cp.empty_like(unsorted_keys)
+    cuda.compute.merge_sort(
+        d_in_keys=unsorted_keys,
+        d_in_values=None,
+        d_out_keys=sorted_keys,
+        d_out_values=None,
+        num_items=unsorted_keys.size,
+        op=OpKind.LESS,
+    )
+
+    np.testing.assert_array_equal(sorted_keys.get(), np.sort(unsorted_keys.get()))
+
+
+def test_radix_sort_key_value_pairs():
+    """Ascending int16 key sort must reorder the uint8 values in stable order."""
+    keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16)
+    payload = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8)
+    sorted_keys = cp.empty_like(cp.asarray(keys))
+    sorted_payload = cp.empty_like(cp.asarray(payload))
+    cuda.compute.radix_sort(
+        d_in_keys=cp.asarray(keys),
+        d_out_keys=sorted_keys,
+        d_in_values=cp.asarray(payload),
+        d_out_values=sorted_payload,
+        num_items=keys.size,
+        order=SortOrder.ASCENDING,
+    )
+
+    stable_order = np.argsort(keys, stable=True)
+    np.testing.assert_array_equal(sorted_keys.get(), keys[stable_order])
+    np.testing.assert_array_equal(sorted_payload.get(), payload[stable_order])
+
+
+def test_segmented_sort_keys():
+    """Sort uint64 keys ascending inside three segments given by offsets."""
+    keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64)
+    boundaries = np.asarray([0, 3, 6, 8], dtype=np.int64)
+    sorted_keys = cp.empty_like(cp.asarray(keys))
+    cuda.compute.segmented_sort(
+        d_in_keys=cp.asarray(keys),
+        d_out_keys=sorted_keys,
+        d_in_values=None,
+        d_out_values=None,
+        num_items=keys.size,
+        num_segments=boundaries.size - 1,
+        start_offsets_in=cp.asarray(boundaries[:-1]),
+        end_offsets_in=cp.asarray(boundaries[1:]),
+        order=SortOrder.ASCENDING,
+    )
+
+    want = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64)
+    np.testing.assert_array_equal(sorted_keys.get(), want)
+
+
+def test_unique_by_key_well_known_equal_to(monkeypatch):
+    """Keep the first item of every run of equal keys (``OpKind.EQUAL_TO``)."""
+    major, _minor = cuda.compute._cccl_interop.CudaDevice().compute_capability
+    if 9 <= major:
+        monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
+    keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16)
+    items = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8)
+    unique_keys = cp.empty_like(keys)
+    unique_items = cp.empty_like(items)
+    unique_count = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.unique_by_key(
+        d_in_keys=keys,
+        d_in_items=items,
+        d_out_keys=unique_keys,
+        d_out_items=unique_items,
+        d_out_num_selected=unique_count,
+        op=OpKind.EQUAL_TO,
+        num_items=keys.size,
+    )
+
+    kept = int(unique_count.get()[0])
+    np.testing.assert_array_equal(unique_keys.get()[:kept], [1, 2, 3, 4])
+    np.testing.assert_array_equal(unique_items.get()[:kept], [10, 20, 30, 40])
+
+
+def test_histogram_even_small_range():
+    """Bin six float32 samples into four even-width bins over [0, 4]."""
+    samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32)
+    bin_counts = cp.empty(4, dtype=np.int32)
+    cuda.compute.histogram_even(
+        d_samples=cp.asarray(samples),
+        d_histogram=bin_counts,
+        num_output_levels=5,
+        lower_level=np.float32(0.0),
+        upper_level=np.float32(4.0),
+        num_samples=samples.size,
+    )
+
+    want, _edges = np.histogram(samples, bins=4, range=(0.0, 4.0))
+    np.testing.assert_array_equal(bin_counts.get(), want.astype(np.int32))
+
+
+def test_select_raw_op():
+    """Select the even values of 0..11 using a compiled C++ predicate."""
+    numbers = np.arange(12, dtype=np.int32)
+    selected = cp.empty_like(cp.asarray(numbers))
+    selected_count = cp.empty(1, dtype=np.uint64)
+    cuda.compute.select(
+        d_in=cp.asarray(numbers),
+        d_out=selected,
+        d_num_selected_out=selected_count,
+        cond=_raw_even_i32_op(),
+        num_items=numbers.size,
+    )
+
+    kept = int(selected_count.get()[0])
+    np.testing.assert_array_equal(selected.get()[:kept], numbers[::2])
+
+
+def test_three_way_partition_raw_op():
+    """Split 0..11 into values < 4, remaining values < 8, and everything else."""
+    numbers = np.arange(12, dtype=np.int32)
+    low = cp.empty_like(cp.asarray(numbers))
+    middle = cp.empty_like(cp.asarray(numbers))
+    rest = cp.empty_like(cp.asarray(numbers))
+    part_sizes = cp.empty(2, dtype=np.uint64)
+    cuda.compute.three_way_partition(
+        d_in=cp.asarray(numbers),
+        d_first_part_out=low,
+        d_second_part_out=middle,
+        d_unselected_out=rest,
+        d_num_selected_out=part_sizes,
+        select_first_part_op=_raw_less_than_i32_op("no_numba_less_than_4_i32", 4),
+        select_second_part_op=_raw_less_than_i32_op("no_numba_less_than_8_i32", 8),
+        num_items=numbers.size,
+    )
+
+    sizes = part_sizes.get()
+    low_count = int(sizes[0])
+    middle_count = int(sizes[1])
+    rest_count = numbers.size - low_count - middle_count
+
+    np.testing.assert_array_equal(low.get()[:low_count], numbers[:4])
+    np.testing.assert_array_equal(middle.get()[:middle_count], numbers[4:8])
+    np.testing.assert_array_equal(rest.get()[:rest_count], numbers[8:])
+
+
+def test_raw_op_reduce():
+    """Reduce int64 values with a compiled C++ addition, starting from 5."""
+    values = np.asarray([10, 20, 30, 40], dtype=np.int64)
+    total = cp.empty(1, dtype=np.int64)
+    cuda.compute.reduce_into(
+        d_in=cp.asarray(values),
+        d_out=total,
+        num_items=values.size,
+        op=_raw_plus_i64_op(),
+        h_init=np.array([5], dtype=np.int64),
+    )
+
+    assert total.get()[0] == 105
+
+
+def test_stream_argument(cuda_stream):
+    """Pass the fixture stream to the launch, then sync the device before reading."""
+    left = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32)
+    right = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32)
+    total = cp.empty_like(left)
+    cuda.compute.binary_transform(
+        d_in1=left,
+        d_in2=right,
+        d_out=total,
+        op=OpKind.PLUS,
+        num_items=left.size,
+        stream=cuda_stream,
+    )
+
+    cp.cuda.Device().synchronize()
+    np.testing.assert_array_equal(total.get(), np.asarray([3, 7, 11, 15, 19]))
+
+
+def test_counting_iterator_reduce():
+    """Sum eight items from a counting iterator at 3; expects 3 + ... + 10 = 52."""
+    total = cp.empty(1, dtype=np.int32)
+    cuda.compute.reduce_into(
+        d_in=CountingIterator(np.int32(3)),
+        d_out=total,
+        num_items=8,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int32),
+    )
+
+    assert total.get()[0] == 52
+
+
+def test_constant_iterator_reduce():
+    """Sum eight items from a constant 1.5 iterator; expects 8 * 1.5 = 12."""
+    total = cp.empty(1, dtype=np.float32)
+    cuda.compute.reduce_into(
+        d_in=ConstantIterator(np.float32(1.5)),
+        d_out=total,
+        num_items=8,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.float32),
+    )
+
+    np.testing.assert_allclose(total.get()[0], np.float32(12.0))
+
+
+def test_cache_modified_input_iterator_reduce():
+    """Plus-reduce a uint16 array through a ``modifier="stream"`` input iterator."""
+    values = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16)
+    total = cp.empty(1, dtype=np.uint16)
+    streamed = CacheModifiedInputIterator(values, modifier="stream")
+    cuda.compute.reduce_into(
+        d_in=streamed,
+        d_out=total,
+        num_items=values.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.uint16),
+    )
+
+    assert total.get()[0] == 30
+
+
+def test_reverse_input_iterator_scan():
+    """Inclusive plus-scan over an input wrapped in a ``ReverseIterator``."""
+    values = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    scanned = cp.empty_like(values)
+    cuda.compute.inclusive_scan(
+        d_in=ReverseIterator(values),
+        d_out=scanned,
+        op=OpKind.PLUS,
+        init_value=np.array([0], dtype=np.int32),
+        num_items=values.size,
+    )
+
+    np.testing.assert_array_equal(scanned.get(), np.asarray([5, 9, 12, 14, 15]))
+
+
+def test_reverse_output_iterator_scan():
+    """Inclusive plus-scan written out through a ``ReverseIterator``."""
+    values = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    scanned = cp.empty_like(values)
+    cuda.compute.inclusive_scan(
+        d_in=values,
+        d_out=ReverseIterator(scanned),
+        op=OpKind.PLUS,
+        init_value=np.array([0], dtype=np.int32),
+        num_items=values.size,
+    )
+
+    np.testing.assert_array_equal(scanned.get(), np.asarray([15, 10, 6, 3, 1]))
+
+
+def test_permutation_iterator_reduce():
+    """Sum the values picked by indices 4, 2, 5, 1; expects 50 + 30 + 60 + 20."""
+    table = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64)
+    picks = cp.asarray([4, 2, 5, 1], dtype=np.int32)
+    total = cp.empty(1, dtype=np.int64)
+    cuda.compute.reduce_into(
+        d_in=PermutationIterator(table, picks),
+        d_out=total,
+        num_items=picks.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int64),
+    )
+
+    assert total.get()[0] == 160
+
+
+def test_transform_iterator_reduce():
+    """Sum the squares of 1..6 produced by a ``TransformIterator``; expects 91."""
+    total = cp.empty(1, dtype=np.int32)
+    squares = TransformIterator(
+        CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32
+    )
+    cuda.compute.reduce_into(
+        d_in=squares,
+        d_out=total,
+        num_items=6,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int32),
+    )
+
+    assert total.get()[0] == 91
+
+
+def test_transform_output_iterator_reduce():
+    """Reduce int16 values into an output that negates on write; expects -10."""
+    values = cp.asarray([1, 2, 3, 4], dtype=np.int16)
+    stored = cp.empty(1, dtype=np.int16)
+    negating_out = TransformOutputIterator(
+        stored, _raw_negate_i16_op(), output_value_type=cccl_int16
+    )
+    cuda.compute.reduce_into(
+        d_in=values,
+        d_out=negating_out,
+        num_items=values.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int16),
+    )
+
+    assert stored.get()[0] == -10
+
+
+def test_zip_iterator_transform():
+    """Add the two fields of each zipped int32 pair with a compiled C++ op."""
+    left = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    right = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32)
+    pair_sums = cp.empty_like(left)
+    cuda.compute.unary_transform(
+        d_in=ZipIterator(left, right),
+        d_out=pair_sums,
+        op=_raw_zip_sum_i32_op(),
+        num_items=left.size,
+    )
+
+    np.testing.assert_array_equal(pair_sums.get(), left.get() + right.get())
+
+
+def test_shuffle_iterator_transform():
+    """Copying a seeded ``ShuffleIterator`` must yield a permutation of 0..16."""
+    count = 17
+    shuffled = cp.empty(count, dtype=np.int64)
+    cuda.compute.unary_transform(
+        d_in=ShuffleIterator(count, seed=123),
+        d_out=shuffled,
+        op=OpKind.IDENTITY,
+        num_items=count,
+    )
+
+    observed = shuffled.get()
+    assert sorted(observed.tolist()) == list(range(count))
+
+
+def test_discard_iterator_transform():
+    """Writes through a ``DiscardIterator`` must leave the wrapped array at -1."""
+    values = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    untouched = cp.full_like(values, -1)
+    cuda.compute.unary_transform(
+        d_in=values,
+        d_out=DiscardIterator(untouched),
+        op=OpKind.IDENTITY,
+        num_items=values.size,
+    )
+
+    np.testing.assert_array_equal(untouched.get(), np.full(5, -1, dtype=np.int32))
diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
index 226904b75da..74d74a7b4e6 100644
--- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 import cupy as cp
 import numpy as np
-import pytest
 
 import cuda.compute
 from cuda.compute.iterators import (
@@ -13,7 +12,6 @@
 )
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_equality():
     values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     values2 = cp.asarray([100, 200, 300], dtype="int32")
@@ -39,7 +37,6 @@ def test_permutation_iterator_equality():
     assert it1.kind != it5.kind
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_with_array_values():
     values = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -57,7 +54,6 @@ def test_permutation_iterator_with_array_values():
     assert d_output[0] == values[indices].sum()
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_with_iterator_values():
     values_it = CountingIterator(np.int32(10))
     indices = cp.asarray([2, 0, 4, 1], dtype="int32")
@@ -161,7 +157,6 @@ def op(a):
     assert cp.all(d_out == expected)
 
 
-@pytest.mark.no_numba
 def test_caching_permutation_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -209,7 +204,6 @@ def test_caching_permutation_iterator():
     )
 
 
-@pytest.mark.no_numba
 def test_permutation_iterator_advance():
     """Test PermutationIterator.__add__ only advances indices, not values."""
     # Create values array [10, 20, 30, 40, 50, 60, 70]
diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index d39a7e07fad..62b8e7dcb44 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -6,6 +6,7 @@
 from typing import Tuple
 
 import cupy as cp
+import numba
 import numpy as np
 import pytest
 
@@ -14,7 +15,6 @@
     DoubleBuffer,
     SortOrder,
 )
-from cuda.core import Device
 
 
 def get_mark(dt, log_size):
@@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu
     DTYPE_SIZE,
 )
 def test_radix_sort_keys(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items)
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_values = cp.asarray(h_in_values)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_values = cp.asarray(h_out_values)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_values = numba.cuda.to_device(h_in_values)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_values = numba.cuda.to_device(h_out_values)
 
     radix_sort_device(
         d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_values = d_out_values.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_values = d_out_values.copy_to_host()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
 
     radix_sort_device(keys_double_buffer, None, None, None, order, num_items)
 
-    h_out_keys = keys_double_buffer.current().get()
+    h_out_keys = keys_double_buffer.current().copy_to_host()
 
     h_in_keys, _ = host_sort(h_in_keys, None, order)
 
@@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_values = cp.asarray(h_in_values)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_values = cp.asarray(h_out_values)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_values = numba.cuda.to_device(h_in_values)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_values = numba.cuda.to_device(h_out_values)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
     values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
         keys_double_buffer, None, values_double_buffer, None, order, num_items
     )
 
-    h_out_keys = keys_double_buffer.current().get()
-    h_out_values = values_double_buffer.current().get()
+    h_out_keys = keys_double_buffer.current().copy_to_host()
+    h_out_values = values_double_buffer.current().copy_to_host()
 
     h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order)
 
@@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE_BIT_WINDOW,
 )
 def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = cp.asarray(h_in_keys)
-        d_in_values = cp.asarray(h_in_values)
-        d_out_keys = cp.asarray(h_out_keys)
-        d_out_values = cp.asarray(h_out_values)
+        d_in_keys = numba.cuda.to_device(h_in_keys)
+        d_in_values = numba.cuda.to_device(h_in_values)
+        d_out_keys = numba.cuda.to_device(h_out_keys)
+        d_out_values = numba.cuda.to_device(h_out_values)
 
         radix_sort_device(
             d_in_keys,
@@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
             end_bit,
         )
 
-        h_out_keys = d_out_keys.get()
-        h_out_values = d_out_values.get()
+        h_out_keys = d_out_keys.copy_to_host()
+        h_out_values = d_out_values.copy_to_host()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = cp.asarray(h_in_keys)
-        d_in_values = cp.asarray(h_in_values)
-        d_out_keys = cp.asarray(h_out_keys)
-        d_out_values = cp.asarray(h_out_values)
+        d_in_keys = numba.cuda.to_device(h_in_keys)
+        d_in_values = numba.cuda.to_device(h_in_values)
+        d_out_keys = numba.cuda.to_device(h_out_keys)
+        d_out_values = numba.cuda.to_device(h_out_values)
 
         keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
         values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
             end_bit,
         )
 
-        h_out_keys = keys_double_buffer.current().get()
-        h_out_values = values_double_buffer.current().get()
+        h_out_keys = keys_double_buffer.current().copy_to_host()
+        h_out_values = values_double_buffer.current().copy_to_host()
 
         h_in_keys, h_in_values = host_sort(
             h_in_keys, h_in_values, order, begin_bit, end_bit
@@ -469,9 +469,8 @@ def test_radix_sort_with_stream(cuda_stream):
     np.testing.assert_array_equal(got, h_in_keys)
 
 
-@pytest.mark.no_numba
 def test_radix_sort(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -519,9 +518,8 @@ def test_radix_sort(monkeypatch):
     np.testing.assert_array_equal(h_out_items, h_in_values)
 
 
-@pytest.mark.no_numba
 def test_radix_sort_double_buffer(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py
index 15f11428506..df3f024f0ee 100644
--- a/python/cuda_cccl/tests/compute/test_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_reduce.py
@@ -6,6 +6,7 @@
 import random
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 from cupy.cuda import runtime
@@ -72,15 +73,15 @@ def add_op(a, b):
 def test_device_reduce(dtype, num_items, op):
     init_value = 42
     h_init = np.array([init_value], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = numba.cuda.device_array(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = cp.asarray(h_input)
+    d_input = numba.cuda.to_device(h_input)
 
     cuda.compute.reduce_into(
         d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init
     )
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == pytest.approx(
         sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0
     )  # obtained relative error value from c2h/include/c2h/check_results.cuh
@@ -93,10 +94,10 @@ def test_device_reduce_with_lambda():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = numba.cuda.device_array(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = cp.asarray(h_input)
+    d_input = numba.cuda.to_device(h_input)
 
     # Use a lambda function directly as the reducer
     cuda.compute.reduce_into(
@@ -106,7 +107,7 @@ def test_device_reduce_with_lambda():
         op=lambda a, b: a + b,
         h_init=h_init,
     )
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == sum(h_input) + init_value
 
 
@@ -117,10 +118,10 @@ def test_device_reduce_with_lambda_variable():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = numba.cuda.device_array(1, dtype=dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = cp.asarray(h_input)
+    d_input = numba.cuda.to_device(h_input)
 
     # Use a lambda function assigned to a variable as the reducer
     cuda.compute.reduce_into(
@@ -130,24 +131,24 @@ def test_device_reduce_with_lambda_variable():
         op=add_op_lambda,
         h_init=h_init,
     )
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == sum(h_input) + init_value
 
 
 def test_complex_device_reduce():
     h_init = np.array([40.0 + 2.0j], dtype=complex)
-    d_output = cp.empty(1, dtype=complex)
+    d_output = numba.cuda.device_array(1, dtype=complex)
 
     for num_items in [42, 420000]:
         real_imag = np.random.random((2, num_items))
         h_input = real_imag[0] + 1j * real_imag[1]
-        d_input = cp.asarray(h_input)
+        d_input = numba.cuda.to_device(h_input)
         assert d_input.size == num_items
         cuda.compute.reduce_into(
             d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
         )
 
-        result = d_output.get()[0]
+        result = d_output.copy_to_host()[0]
         expected = np.sum(h_input, initial=h_init[0])
         assert result == pytest.approx(expected)
 
@@ -161,11 +162,11 @@ def _test_device_sum_with_iterator(
 
     if use_numpy_array:
         h_input = np.array(l_varr, dtype_inp)
-        d_input = cp.asarray(h_input)
+        d_input = numba.cuda.to_device(h_input)
     else:
         d_input = i_input
 
-    d_output = cp.empty(1, dtype_out)  # to store device sum
+    d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
 
     h_init = np.array([start_sum_with], dtype_out)
 
@@ -173,7 +174,7 @@ def _test_device_sum_with_iterator(
         d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init
     )
 
-    h_output = d_output.get()
+    h_output = d_output.copy_to_host()
     assert h_output[0] == expected_result
 
 
@@ -215,7 +216,7 @@ def test_device_sum_cache_modified_input_it(
     l_varr = [rng.randrange(100) for _ in range(num_items)]
     dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp))
+    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
     i_input = CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -690,7 +691,6 @@ def add_op(x, y):
         )
 
 
-@pytest.mark.no_numba
 def test_device_reduce_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -709,7 +709,6 @@ def test_device_reduce_well_known_plus():
     assert (d_output == expected_output).all()
 
 
-@pytest.mark.no_numba
 def test_device_reduce_well_known_minimum():
     dtype = np.int32
     h_init = np.array([100], dtype=dtype)
@@ -728,7 +727,6 @@ def test_device_reduce_well_known_minimum():
     assert (d_output == expected_output).all()
 
 
-@pytest.mark.no_numba
 def test_device_reduce_well_known_maximum():
     dtype = np.int32
     h_init = np.array([-100], dtype=dtype)
@@ -927,7 +925,6 @@ def sqrt(x: dtype) -> dtype:
     np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
 
 
-@pytest.mark.no_numba
 def test_reduce_with_not_guaranteed_determinism(floating_array):
     dtype = floating_array.dtype
     h_init = np.array([0], dtype=dtype)
@@ -945,7 +942,6 @@ def test_reduce_with_not_guaranteed_determinism(floating_array):
     )
 
 
-@pytest.mark.no_numba
 def test_reduce_bool():
     h_init = np.array([False])
     d_input = cp.array([True, False, True])
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index 0eb3af34c49..af6d941b11b 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -4,6 +4,7 @@
 
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -15,7 +16,6 @@
     TransformOutputIterator,
     gpu_struct,
 )
-from cuda.core import Device
 
 
 def scan_host(h_input: np.ndarray, op, h_init, force_inclusive):
@@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea
     [True, False],
 )
 def test_scan_array_input(force_inclusive, input_array, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification if input is complex
     # as LDL/STL instructions are emitted for complex types.
     # Also skip for:
@@ -214,7 +214,6 @@ def op(a, b):
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
 
-@pytest.mark.no_numba
 def test_exclusive_scan_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
@@ -233,9 +232,8 @@ def test_exclusive_scan_well_known_plus():
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_inclusive_scan_well_known_plus(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -356,7 +354,7 @@ def add_op(a, b):
 
 
 def test_reverse_input_iterator(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -411,7 +409,6 @@ def add_op(a, b):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize(
     "force_inclusive",
     [True, False],
@@ -433,14 +430,13 @@ def test_future_init_value(force_inclusive):
     np.testing.assert_array_equal(expected, got)
 
 
-@pytest.mark.no_numba
 def test_no_init_value(monkeypatch):
     force_inclusive = True
     num_items = 1024
     dtype = np.dtype("int32")
 
     # Skip SASS check for CC 9.0 due to LDL/STL CI failure.
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     if cc_major >= 9:
         import cuda.compute._cccl_interop
 
@@ -460,7 +456,6 @@ def test_no_init_value(monkeypatch):
     np.testing.assert_array_equal(expected, got)
 
 
-@pytest.mark.no_numba
 def test_no_init_value_iterator():
     force_inclusive = True
     num_items = 1024
@@ -498,7 +493,6 @@ def test_inclusive_scan_with_lambda():
     np.testing.assert_array_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize("force_inclusive", [True, False])
 def test_scan_bool_maximum(force_inclusive):
     h_init = np.array([False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
index 815c98b8edb..950d860a14d 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
@@ -269,7 +269,6 @@ def _plus(a, b):
         )
 
 
-@pytest.mark.no_numba
 def test_segmented_reduce_well_known_plus(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -300,7 +299,6 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_segmented_reduce_well_known_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
@@ -331,7 +329,6 @@ def test_segmented_reduce_well_known_maximum(monkeypatch):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_segmented_reduce_bool_maximum(monkeypatch):
     # Disable SASS verification for this test (LDL instruction in SASS).
     monkeypatch.setattr(
diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py
index 9c2ab9f2de4..2bdd03abb42 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_sort.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py
@@ -5,13 +5,12 @@
 from typing import Tuple
 
 import cupy as cp
+import numba
 import numpy as np
 import pytest
 
 import cuda.compute
 
-pytestmark = pytest.mark.no_numba
-
 DTYPE_LIST = [
     np.uint8,
     np.int16,
@@ -119,8 +118,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=50)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -134,7 +133,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
         order=order,
     )
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -154,10 +153,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_vals = cp.asarray(h_in_vals)
-    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
-    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_vals = numba.cuda.to_device(h_in_vals)
+    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -171,8 +170,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_vals = d_out_vals.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_vals = d_out_vals.copy_to_host()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -190,8 +189,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
 
     cuda.compute.segmented_sort(
@@ -206,7 +205,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().get()
+    h_out_keys = keys_db.current().copy_to_host()
     expected_keys, _ = host_segmented_sort(
         h_in_keys, None, start_offsets, end_offsets, order
     )
@@ -225,10 +224,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_vals = cp.asarray(h_in_vals)
-    d_tmp_keys = cp.asarray(np.empty_like(h_in_keys))
-    d_tmp_vals = cp.asarray(np.empty_like(h_in_vals))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_vals = numba.cuda.to_device(h_in_vals)
+    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
 
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
     vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals)
@@ -245,8 +244,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
         order=order,
     )
 
-    h_out_keys = keys_db.current().get()
-    h_out_vals = vals_db.current().get()
+    h_out_keys = keys_db.current().copy_to_host()
+    h_out_vals = vals_db.current().copy_to_host()
 
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
@@ -298,10 +297,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
     h_in_keys = random_array(num_items, np.int32, max_value=100)
     h_in_vals = random_array(num_items, np.float32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_vals = cp.asarray(h_in_vals)
-    d_out_keys = cp.asarray(np.empty_like(h_in_keys))
-    d_out_vals = cp.asarray(np.empty_like(h_in_vals))
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_vals = numba.cuda.to_device(h_in_vals)
+    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -315,8 +314,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
         order=order,
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_vals = d_out_vals.get()
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_vals = d_out_vals.copy_to_host()
     expected_keys, expected_vals = host_segmented_sort(
         h_in_keys, h_in_vals, start_offsets, end_offsets, order
     )
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index 3dcaf98a507..f2483860a4a 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -8,8 +8,6 @@
 
 import cuda.compute
 from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
-from cuda.compute._cpp_compile import compile_cpp_op_code
-from cuda.compute.op import RawOp
 
 DTYPE_LIST = [
     np.uint8,
@@ -61,35 +59,6 @@ def _host_select(h_in: np.ndarray, cond):
     return selected, np.int64(selected.size)
 
 
-def _raw_even_i32_op() -> RawOp:
-    source = """
-extern "C" __device__ void is_even_i32(void* x, void* result) {
-    int value = *static_cast<int*>(x);
-    *static_cast<bool*>(result) = (value % 2) == 0;
-}
-"""
-    return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32")
-
-
-@pytest.mark.no_numba
-def test_select_raw_op_minimal():
-    h_in = np.arange(10, dtype=np.int32)
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
-
-    cuda.compute.select(
-        d_in=d_in,
-        d_out=d_out,
-        d_num_selected_out=d_num_selected,
-        cond=_raw_even_i32_op(),
-        num_items=len(d_in),
-    )
-
-    num_selected = int(d_num_selected[0].get())
-    np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0])
-
-
 @pytest.mark.parametrize("dtype,num_items", select_params)
 def test_select_basic(dtype, num_items):
     h_in = random_array(num_items, dtype, max_value=100)
diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
index 715f9e51235..5c82fe1ad38 100644
--- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
@@ -87,7 +87,6 @@ def test_shuffle_iterator_with_permutation_iterator():
     assert sorted(result) == sorted(d_values.get())
 
 
-@pytest.mark.no_numba
 def test_shuffle_iterator_invalid_num_items():
     with pytest.raises(ValueError, match="num_items must be > 0"):
         ShuffleIterator(0, seed=42)
diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
index cf1edd7e681..3ddaee54d41 100644
--- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py
+++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py
@@ -14,8 +14,6 @@
 
 from cuda.compute import gpu_struct
 
-pytestmark = pytest.mark.no_numba
-
 
 def test_newline_in_field_name_is_rejected():
     """Field names with newlines must be rejected — they are the exec() injection vector."""
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 8def513681b..3fb5b102275 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -8,8 +8,6 @@
 
 import cuda.compute
 from cuda.compute import CacheModifiedInputIterator, gpu_struct
-from cuda.compute._cpp_compile import compile_cpp_op_code
-from cuda.compute.op import RawOp
 
 DTYPE_LIST = [
     np.uint8,
@@ -67,50 +65,6 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op):
     )
 
 
-def _raw_less_than_i32(name: str, threshold: int) -> RawOp:
-    source = f"""
-extern "C" __device__ void {name}(void* x, void* result) {{
-    int value = *static_cast<int*>(x);
-    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
-}}
-"""
-    return RawOp(ltoir=compile_cpp_op_code(source), name=name)
-
-
-@pytest.mark.no_numba
-def test_three_way_partition_raw_op_minimal():
-    h_in = np.arange(10, dtype=np.int32)
-    d_in = cp.asarray(h_in)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
-
-    cuda.compute.three_way_partition(
-        d_in=d_in,
-        d_first_part_out=d_first,
-        d_second_part_out=d_second,
-        d_unselected_out=d_unselected,
-        d_num_selected_out=d_num_selected,
-        select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3),
-        select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6),
-        num_items=len(d_in),
-    )
-
-    selected = d_num_selected.get()
-    first_count = int(selected[0])
-    second_count = int(selected[1])
-    unselected_count = len(h_in) - first_count - second_count
-
-    np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3])
-    np.testing.assert_array_equal(
-        d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)]
-    )
-    np.testing.assert_array_equal(
-        d_unselected.get()[:unselected_count], h_in[h_in >= 6]
-    )
-
-
 @pytest.mark.parametrize("dtype,num_items", three_way_partition_params)
 def test_three_way_partition_basic(dtype, num_items, monkeypatch):
     # NOTE: the SASS check failure is seen only with NVRTC 13.1:
diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py
index b8429726d77..c7c3ca2818a 100644
--- a/python/cuda_cccl/tests/compute/test_transform.py
+++ b/python/cuda_cccl/tests/compute/test_transform.py
@@ -266,7 +266,6 @@ def op2(a):
     np.testing.assert_allclose(expected, got)
 
 
-@pytest.mark.no_numba
 def test_unary_transform_well_known_negate():
     """Test unary transform with well-known NEGATE operation."""
     dtype = np.int32
@@ -283,7 +282,6 @@ def test_unary_transform_well_known_negate():
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_unary_transform_well_known_identity():
     """Test unary transform with well-known IDENTITY operation."""
     dtype = np.int32
@@ -300,7 +298,6 @@ def test_unary_transform_well_known_identity():
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 @pytest.mark.parametrize("dtype", [np.int32, np.float16])
 def test_binary_transform_well_known_plus(dtype):
     """Test binary transform with well-known PLUS operation."""
@@ -322,7 +319,6 @@ def test_binary_transform_well_known_plus(dtype):
     np.testing.assert_equal(d_output.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_binary_transform_well_known_multiplies():
     """Test binary transform with well-known MULTIPLIES operation."""
     dtype = np.int32
@@ -590,7 +586,6 @@ def test_binary_transform_with_lambda():
     np.testing.assert_array_equal(d_out.get(), expected)
 
 
-@pytest.mark.no_numba
 def test_binary_transform_bool_equal_to():
     d_input1 = cp.array([True, False, True, False], dtype=np.bool_)
     d_input2 = cp.array([True, True, False, False], dtype=np.bool_)
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 945d4d99028..83a4a17db67 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -4,6 +4,7 @@
 
 
 import cupy as cp
+import numba.cuda
 import numpy as np
 import pytest
 
@@ -14,7 +15,6 @@
     OpKind,
     gpu_struct,
 )
-from cuda.core import Device
 
 DTYPE_LIST = [
     np.uint8,
@@ -125,7 +125,7 @@ def compare_op(lhs, rhs):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key(dtype, num_items, op, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int64)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items)
 
@@ -226,9 +226,9 @@ def test_unique_by_key_keys_only():
     h_out_keys = np.empty(num_items, dtype=np.int32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -240,9 +240,9 @@ def test_unique_by_key_keys_only():
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
 
     expected_keys, _ = unique_by_key_host(
         h_in_keys,
@@ -267,11 +267,11 @@ def compare_complex(lhs, rhs):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_items = cp.asarray(h_in_items)
-    d_out_keys = cp.asarray(h_out_keys)
-    d_out_items = cp.asarray(h_out_items)
-    d_out_num_selected = cp.asarray(h_out_num_selected)
+    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_items = numba.cuda.to_device(h_in_items)
+    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_out_items = numba.cuda.to_device(h_out_items)
+    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
 
     unique_by_key_device(
         d_in_keys,
@@ -283,10 +283,10 @@ def compare_complex(lhs, rhs):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(
         h_in_keys, h_in_items, compare_complex
@@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs):
 
 
 def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -417,9 +417,8 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
     np.testing.assert_array_equal(h_out_items, expected_items)
 
 
-@pytest.mark.no_numba
 def test_unique_by_key_well_known_equal_to(monkeypatch):
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
index 2788a3c0893..e08709b6e1a 100644
--- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
+++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py
@@ -12,16 +12,10 @@
 """
 
 import pytest
+from numba import types
 
-numba = pytest.importorskip("numba")
-types = numba.types
-
-from cuda.compute._odr_helpers import (  # noqa: E402
-    _ArgMode,
-    _ArgSpec,
-    _create_void_ptr_wrapper,
-)
-from cuda.compute._utils import sanitize_identifier  # noqa: E402
+from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper
+from cuda.compute._utils import sanitize_identifier
 
 
 def _make_arg_specs():
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index 9fbed38359f..85630bb4f5f 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -12,7 +12,6 @@
     ZipIterator,
     gpu_struct,
 )
-from cuda.core import Device
 
 
 @pytest.mark.parametrize("num_items", [10, 1_000, 100_000])
@@ -262,8 +261,10 @@ def min_pairs(p1, p2):
 @pytest.mark.parametrize("num_items", [10, 1000])
 def test_output_zip_iterator_with_scan(monkeypatch, num_items):
     """Test ZipIterator as output iterator with scan operations."""
+    import numba.cuda
+
     # Skip SASS check for CC 8.0+ due to LDL/STL CI failure.
-    cc_major, _ = Device().compute_capability
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -425,7 +426,9 @@ def sum_nested_zips(v1, v2):
     ],
 )
 def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map):
-    cc_major, _ = Device().compute_capability
+    import numba.cuda
+
+    cc_major, _ = numba.cuda.get_current_device().compute_capability
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -499,7 +502,6 @@ def g(x):
     assert it1.kind != it2.kind
 
 
-@pytest.mark.no_numba
 def test_caching_zip_iterator():
     """Test that iterator compilation is cached across instances with the same structure."""
     from cuda.compute._cpp_compile import compile_cpp_op_code
@@ -575,7 +577,6 @@ def test_caching_zip_iterator():
     assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind"
 
 
-@pytest.mark.no_numba
 def test_compilation_caching_across_iterator_types():
     """Test that compilation caching works across different iterator types."""
     from cuda.compute import ConstantIterator