From a79569e667706e97b51a876074ef5947431957a8 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 10:59:39 -0500 Subject: [PATCH 01/16] Add initial free threading cuda.compute implementation --- c/parallel/src/transform.cu | 16 + python/cuda_cccl/CMakeLists.txt | 19 + .../cuda_cccl/cuda/compute/_bindings_impl.pyx | 1 + python/cuda_cccl/cuda/compute/_caching.py | 167 +- .../cuda/compute/algorithms/_binary_search.py | 20 +- .../cuda/compute/algorithms/_histogram.py | 31 +- .../cuda/compute/algorithms/_reduce.py | 20 +- .../cuda/compute/algorithms/_scan.py | 21 +- .../compute/algorithms/_segmented_reduce.py | 25 +- .../compute/algorithms/_sort/_merge_sort.py | 22 +- .../compute/algorithms/_sort/_radix_sort.py | 27 +- .../algorithms/_sort/_segmented_sort.py | 29 +- .../algorithms/_three_way_partition.py | 28 +- .../cuda/compute/algorithms/_transform.py | 33 +- .../cuda/compute/algorithms/_unique_by_key.py | 25 +- .../cuda_cccl/cuda/compute/iterators/_base.py | 19 +- .../compute/test_free_threading_stress.py | 1444 +++++++++++++++++ 17 files changed, 1848 insertions(+), 99 deletions(-) create mode 100644 python/cuda_cccl/tests/compute/test_free_threading_stress.py diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu index 63f36106a16..81e8b1dbf88 100644 --- a/c/parallel/src/transform.cu +++ b/c/parallel/src/transform.cu @@ -120,24 +120,32 @@ struct transform_kernel_source cub::detail::transform::cuda_expected CacheAsyncConfiguration(const ActionT& action) { +#if defined(CCCL_PYTHON_FREE_THREADED) + return action(); +#else // defined(CCCL_PYTHON_FREE_THREADED) auto cache = reinterpret_cast(build.cache); if (!cache->async_config.has_value()) { cache->async_config = action(); } return *cache->async_config; +#endif // defined(CCCL_PYTHON_FREE_THREADED) } template cub::detail::transform::cuda_expected CachePrefetchConfiguration(const ActionT& action) { +#if defined(CCCL_PYTHON_FREE_THREADED) + return action(); +#else // 
defined(CCCL_PYTHON_FREE_THREADED) auto cache = reinterpret_cast(build.cache); if (!cache->prefetch_config.has_value()) { cache->prefetch_config = action(); } return *cache->prefetch_config; +#endif // defined(CCCL_PYTHON_FREE_THREADED) } CUkernel TransformKernel() const @@ -325,7 +333,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho build_ptr->cc = cc_major * 10 + cc_minor; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; +#if defined(CCCL_PYTHON_FREE_THREADED) + build_ptr->cache = nullptr; +#else // defined(CCCL_PYTHON_FREE_THREADED) build_ptr->cache = new transform::cache(); +#endif // defined(CCCL_PYTHON_FREE_THREADED) // avoid new and delete which requires the allocated and freed types to match static_assert(::cuda::is_trivially_copyable_v); @@ -526,7 +538,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H build_ptr->cc = cc_major * 10 + cc_minor; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; +#if defined(CCCL_PYTHON_FREE_THREADED) + build_ptr->cache = nullptr; +#else // defined(CCCL_PYTHON_FREE_THREADED) build_ptr->cache = new transform::cache(); +#endif // defined(CCCL_PYTHON_FREE_THREADED) // avoid new and delete which requires the allocated and freed types to match static_assert(::cuda::is_trivially_copyable_v); diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 09044f19442..905087a7f42 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -83,6 +83,25 @@ install( # Build and install Cython extension find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) +set( + _python_gil_disabled_query + "import sysconfig; print('1' if sysconfig.get_config_var('Py_GIL_DISABLED') in (1, '1') else '0')" +) +execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "${_python_gil_disabled_query}" + OUTPUT_VARIABLE _python_gil_disabled + 
RESULT_VARIABLE _python_gil_disabled_result + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if (NOT _python_gil_disabled_result EQUAL 0) + message(FATAL_ERROR "Failed to query Py_GIL_DISABLED from ${Python3_EXECUTABLE}") +endif() + +if (Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 14 AND "${_python_gil_disabled}" STREQUAL "1") + target_compile_definitions(${_cccl_c_parallel_target} PRIVATE CCCL_PYTHON_FREE_THREADED=1) + message(STATUS "Enabling CCCL_PYTHON_FREE_THREADED for Python 3.14t") +endif() + get_filename_component(_python_path "${Python3_EXECUTABLE}" PATH) set(CYTHON_version_command "${Python3_EXECUTABLE}" -m cython --version) diff --git a/python/cuda_cccl/cuda/compute/_bindings_impl.pyx b/python/cuda_cccl/cuda/compute/_bindings_impl.pyx index 89ad14b2209..43c2ad3f1a2 100644 --- a/python/cuda_cccl/cuda/compute/_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/compute/_bindings_impl.pyx @@ -6,6 +6,7 @@ # distutils: language = c++ # cython: language_level=3 # cython: linetrace=True +# cython: freethreading_compatible=True # Python signatures are declared in the companion Python stub file _bindings.pyi # Make sure to update PYI with change to Python API to ensure that Python diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py index 8df4779e2cc..f3315bc0a0a 100644 --- a/python/cuda_cccl/cuda/compute/_caching.py +++ b/python/cuda_cccl/cuda/compute/_caching.py @@ -6,15 +6,14 @@ from __future__ import annotations import functools +import threading import types +import weakref from typing import Any, Callable, Hashable import numpy as np -try: - from cuda.core import Device -except ImportError: - from cuda.core.experimental import Device +from cuda.core import Device from ._utils.protocols import get_dtype, get_shape, is_device_array from .struct import _Struct @@ -93,6 +92,150 @@ def _make_cache_key_from_args(*args, **kwargs) -> tuple: _cache_registry: dict[str, object] = {} +class _ThreadLocalCaches: + """ + 
Container for wrapper caches owned by a single Python thread. + + Each thread gets its own instance via ``threading.local()``. We use + ``__weakref__`` to enable the process-wide registry of caches to hold weak + references to the thread's caches. That way, if a thread exits, its caches + will be garbage collected and removed from the registry even if the + process-wide registry still references them. + """ + + __slots__ = ("wrapper_caches", "__weakref__") + + def __init__(self) -> None: + # Outer key: decorated algorithm factory name. Inner key: current thread + # id, current CUDA runtime device ordinal, compute capability, and + # specialization key derived from factory arguments. + self.wrapper_caches: dict[str, dict[Hashable, Any]] = {} + + +class _InFlightBuild: + """ + Coordination state for one shared build-result currently being built. + + The first thread for a cache key runs the builder. Other threads wait on + ``condition`` and receive either the completed build result or the builder's + exception. + """ + + def __init__(self) -> None: + self.condition = threading.Condition() + self.done = False + self.result: Any = None + self.exception: BaseException | None = None + + +_thread_local = threading.local() +# Process wide registry of per-thread caches. It enables a thread to call +# clear_all_caches() to clear all caches across all threads. 
+_thread_cache_registry: weakref.WeakSet[_ThreadLocalCaches] = weakref.WeakSet() +_thread_cache_registry_lock = threading.Lock() + +_shared_build_cache: dict[Hashable, Any] = {} +_in_flight_builds: dict[Hashable, _InFlightBuild] = {} +_shared_build_cache_lock = threading.Lock() + + +def _get_current_device_info() -> tuple[int, tuple[int, int]]: + device = Device() + cc = device.compute_capability + return device.device_id, (cc.major, cc.minor) + + +def _get_thread_caches() -> _ThreadLocalCaches: + caches = getattr(_thread_local, "caches", None) + if caches is None: + caches = _ThreadLocalCaches() + _thread_local.caches = caches + with _thread_cache_registry_lock: + _thread_cache_registry.add(caches) + return caches + + +def _clear_wrapper_caches(cache_name: str | None = None) -> None: + with _thread_cache_registry_lock: + thread_caches = list(_thread_cache_registry) + + for caches in thread_caches: + if cache_name is None: + caches.wrapper_caches.clear() + else: + caches.wrapper_caches.pop(cache_name, None) + + +def cache_build_result( + build_result_type: type, + *key_args, + builder: Callable[[], Any], +) -> Any: + """ + Cache a shared Cython build-result object for the current CUDA device. + + The key intentionally excludes the current Python thread. Wrappers are + cached per thread, but build results are shared across threads for the same + device ordinal and specialization key. + + Args: + build_result_type: Cython build-result type. This separates different + build-result caches that may otherwise have identical specialization + keys. + *key_args: Positional values used to form the specialization part of + the cache key. + builder: Callable that creates the build result on a cache miss. + Exactly one thread runs this callable for a given key while other + threads wait for the result. + + Returns: + The cached or newly built Cython build-result object. 
+ """ + device_id, cc_key = _get_current_device_info() + user_cache_key = _make_cache_key_from_args(*key_args) + cache_key = (build_result_type, device_id, cc_key, user_cache_key) + + with _shared_build_cache_lock: + if cache_key in _shared_build_cache: + return _shared_build_cache[cache_key] + + in_flight = _in_flight_builds.get(cache_key) + if in_flight is None: + in_flight = _InFlightBuild() + _in_flight_builds[cache_key] = in_flight + is_builder = True + else: + is_builder = False + + if is_builder: + try: + result = builder() + except BaseException as exc: + with _shared_build_cache_lock: + _in_flight_builds.pop(cache_key, None) + with in_flight.condition: + in_flight.exception = exc + in_flight.done = True + in_flight.condition.notify_all() + raise + + with _shared_build_cache_lock: + _shared_build_cache[cache_key] = result + _in_flight_builds.pop(cache_key, None) + with in_flight.condition: + in_flight.result = result + in_flight.done = True + in_flight.condition.notify_all() + return result + + with in_flight.condition: + while not in_flight.done: + in_flight.condition.wait() + if in_flight.exception is not None: + raise in_flight.exception + return in_flight.result + + class _CacheWithRegisteredKeyFunctions: """ Decorator to cache the result of the decorated function. @@ -113,19 +256,21 @@ def __call__(self, func: Callable) -> Callable: The CUDA compute capability of the current device is appended to the cache key. 
""" - cache: dict = {} + cache_name = func.__qualname__ @functools.wraps(func) def inner(*args, **kwargs): - cc = Device().compute_capability + device_id, cc_key = _get_current_device_info() user_cache_key = _make_cache_key_from_args(*args, **kwargs) - cache_key = (user_cache_key, tuple(cc)) + cache_key = (threading.get_ident(), device_id, cc_key, user_cache_key) + thread_caches = _get_thread_caches() + cache = thread_caches.wrapper_caches.setdefault(cache_name, {}) if cache_key not in cache: result = func(*args, **kwargs) cache[cache_key] = result return cache[cache_key] - inner.cache_clear = cache.clear # type: ignore[attr-defined] + inner.cache_clear = lambda: _clear_wrapper_caches(cache_name) # type: ignore[attr-defined] # Register the cache in the central registry _cache_registry[func.__qualname__] = inner @@ -182,8 +327,10 @@ def clear_all_caches(): >>> import cuda.compute >>> cuda.compute.clear_all_caches() """ - for cached_func in _cache_registry.values(): - cached_func.cache_clear() + _clear_wrapper_caches() + with _shared_build_cache_lock: + _shared_build_cache.clear() + _in_flight_builds.clear() class CachableFunction: diff --git a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py index 6cadd994ceb..83928127bc6 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py @@ -9,7 +9,7 @@ from .. import _bindings, types from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import call_build, set_cccl_iterator_state from .._utils import protocols from ..op import OpAdapter, OpKind, make_op_adapter @@ -71,13 +71,21 @@ def __init__( self.op_cccl = comp.compile((data_value_type, data_value_type), types.uint8) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceBinarySearchBuildResult, + d_data, + d_values, + d_out, + comp, mode, - self.d_data_cccl, - self.d_values_cccl, - self.d_out_cccl, - self.op_cccl, + builder=lambda: call_build( + _bindings.DeviceBinarySearchBuildResult, + mode, + self.d_data_cccl, + self.d_values_cccl, + self.d_out_cccl, + self.op_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py index f865a767dab..6b406989ef2 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py @@ -11,7 +11,7 @@ from .. import _bindings from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import call_build, set_cccl_iterator_state, to_cccl_value_state from .._utils.protocols import get_data_pointer, validate_and_get_stream from .._utils.temp_storage_buffer import TempStorageBuffer @@ -51,17 +51,28 @@ def __init__( self.h_lower_level_cccl = cccl.to_cccl_value(h_lower_level) self.h_upper_level_cccl = cccl.to_cccl_value(h_upper_level) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceHistogramBuildResult, - num_channels, - num_active_channels, - self.d_samples_cccl, - num_levels, - self.d_histogram_cccl, - self.h_lower_level_cccl, - self.num_rows, - row_stride_samples, + d_samples, + d_histogram, + int(num_levels), + h_lower_level[0].item(), + h_upper_level[0].item(), + h_lower_level.dtype, + num_samples, is_evenly_segmented, + builder=lambda: call_build( + _bindings.DeviceHistogramBuildResult, + num_channels, + num_active_channels, + self.d_samples_cccl, + num_levels, + self.d_histogram_cccl, + self.h_lower_level_cccl, + self.num_rows, + row_stride_samples, + is_evenly_segmented, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py index d9c20cad2dc..d41b5223cb6 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py @@ -11,7 +11,7 @@ from .. import _bindings from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import ( call_build, get_value_type, @@ -59,13 +59,21 @@ def __init__( value_type = get_value_type(h_init) self.op_cccl = op.compile((value_type, value_type), value_type) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceReduceBuildResult, - self.d_in_cccl, - self.d_out_cccl, - self.op_cccl, - self.h_init_cccl, + d_in, + d_out, + op, + h_init, determinism, + builder=lambda: call_build( + _bindings.DeviceReduceBuildResult, + self.d_in_cccl, + self.d_out_cccl, + self.op_cccl, + self.h_init_cccl, + determinism, + ), ) match determinism: diff --git a/python/cuda_cccl/cuda/compute/algorithms/_scan.py b/python/cuda_cccl/cuda/compute/algorithms/_scan.py index bc7ecd4c587..21d78b342ad 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_scan.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_scan.py @@ -11,7 +11,7 @@ from .. import _bindings from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import ( call_build, get_value_type, @@ -89,14 +89,23 @@ def __init__( # Compile the op with value types self.op_cccl = op.compile((value_type, value_type), value_type) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceScanBuildResult, - self.d_in_cccl, - self.d_out_cccl, - self.op_cccl, - init_value_type_info, + d_in, + d_out, + op, + init_value, force_inclusive, self.init_kind, + builder=lambda: call_build( + _bindings.DeviceScanBuildResult, + self.d_in_cccl, + self.d_out_cccl, + self.op_cccl, + init_value_type_info, + force_inclusive, + self.init_kind, + ), ) match (force_inclusive, self.init_kind): diff --git a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py index 74b593f9944..5edfa5e0312 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py @@ -11,7 +11,7 @@ from .. import _bindings from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import ( call_build, get_value_type, @@ -58,14 +58,23 @@ def __init__( self.op_cccl = op.compile((value_type, value_type), value_type) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceSegmentedReduceBuildResult, - self.d_in_cccl, - self.d_out_cccl, - self.start_offsets_in_cccl, - self.end_offsets_in_cccl, - self.op_cccl, - self.h_init_cccl, + d_in, + d_out, + start_offsets_in, + end_offsets_in, + op, + h_init, + builder=lambda: call_build( + _bindings.DeviceSegmentedReduceBuildResult, + self.d_in_cccl, + self.d_out_cccl, + self.start_offsets_in_cccl, + self.end_offsets_in_cccl, + self.op_cccl, + self.h_init_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py index 1070042a4c4..ace448bd511 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py @@ -8,7 +8,7 @@ from ... import _bindings, types from ... 
import _cccl_interop as cccl -from ..._caching import cache_with_registered_key_functions +from ..._caching import cache_build_result, cache_with_registered_key_functions from ..._cccl_interop import call_build, set_cccl_iterator_state from ..._utils.protocols import ( get_data_pointer, @@ -52,13 +52,21 @@ def __init__( value_type = cccl.get_value_type(d_in_keys) self.op_cccl = op.compile((value_type, value_type), types.int8) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceMergeSortBuildResult, - self.d_in_keys_cccl, - self.d_in_values_cccl, - self.d_out_keys_cccl, - self.d_out_values_cccl, - self.op_cccl, + d_in_keys, + d_in_values, + d_out_keys, + d_out_values, + op, + builder=lambda: call_build( + _bindings.DeviceMergeSortBuildResult, + self.d_in_keys_cccl, + self.d_in_values_cccl, + self.d_out_keys_cccl, + self.d_out_values_cccl, + self.op_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py index d09dcd0a79a..cf2af440135 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py @@ -7,7 +7,7 @@ from ... import _bindings from ... 
import _cccl_interop as cccl -from ..._caching import cache_with_registered_key_functions +from ..._caching import cache_build_result, cache_with_registered_key_functions from ..._cccl_interop import call_build, set_cccl_iterator_state from ..._utils.protocols import ( get_data_pointer, @@ -56,15 +56,26 @@ def __init__( ) decomposer_return_type = "".encode("utf-8") - self.build_result = call_build( - _bindings.DeviceRadixSortBuildResult, + build_order = ( _bindings.SortOrder.ASCENDING if order is SortOrder.ASCENDING - else _bindings.SortOrder.DESCENDING, - self.d_in_keys_cccl, - self.d_in_values_cccl, - self.decomposer_op, - decomposer_return_type, + else _bindings.SortOrder.DESCENDING + ) + self.build_result = cache_build_result( + _bindings.DeviceRadixSortBuildResult, + d_in_keys, + d_out_keys, + d_in_values, + d_out_values, + order, + builder=lambda: call_build( + _bindings.DeviceRadixSortBuildResult, + build_order, + self.d_in_keys_cccl, + self.d_in_values_cccl, + self.decomposer_op, + decomposer_return_type, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py index d74cd256a81..33ec5279c36 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py @@ -9,7 +9,7 @@ from ... import _bindings from ... 
import _cccl_interop as cccl -from ..._caching import cache_with_registered_key_functions +from ..._caching import cache_build_result, cache_with_registered_key_functions from ..._cccl_interop import call_build, set_cccl_iterator_state from ..._utils.protocols import ( get_data_pointer, @@ -52,15 +52,28 @@ def __init__( self.start_offsets_in_cccl = cccl.to_cccl_input_iter(start_offsets_in) self.end_offsets_in_cccl = cccl.to_cccl_input_iter(end_offsets_in) - self.build_result = call_build( - _bindings.DeviceSegmentedSortBuildResult, + build_order = ( _bindings.SortOrder.ASCENDING if order is SortOrder.ASCENDING - else _bindings.SortOrder.DESCENDING, - self.d_in_keys_cccl, - self.d_in_values_cccl, - self.start_offsets_in_cccl, - self.end_offsets_in_cccl, + else _bindings.SortOrder.DESCENDING + ) + self.build_result = cache_build_result( + _bindings.DeviceSegmentedSortBuildResult, + d_in_keys, + d_out_keys, + d_in_values, + d_out_values, + start_offsets_in, + end_offsets_in, + order, + builder=lambda: call_build( + _bindings.DeviceSegmentedSortBuildResult, + build_order, + self.d_in_keys_cccl, + self.d_in_values_cccl, + self.start_offsets_in_cccl, + self.end_offsets_in_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py index fbd3154feb8..d5b076028c5 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py @@ -9,7 +9,7 @@ from .. import _bindings, types from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import call_build, set_cccl_iterator_state from .._utils import protocols from .._utils.temp_storage_buffer import TempStorageBuffer @@ -54,15 +54,25 @@ def __init__( (value_type,), types.uint8 ) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceThreeWayPartitionBuildResult, - self.d_in_cccl, - self.d_first_part_out_cccl, - self.d_second_part_out_cccl, - self.d_unselected_out_cccl, - self.d_num_selected_out_cccl, - self.select_first_part_op_cccl, - self.select_second_part_op_cccl, + d_in, + d_first_part_out, + d_second_part_out, + d_unselected_out, + d_num_selected_out, + select_first_part_op, + select_second_part_op, + builder=lambda: call_build( + _bindings.DeviceThreeWayPartitionBuildResult, + self.d_in_cccl, + self.d_first_part_out_cccl, + self.d_second_part_out_cccl, + self.d_unselected_out_cccl, + self.d_num_selected_out_cccl, + self.select_first_part_op_cccl, + self.select_second_part_op_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_transform.py b/python/cuda_cccl/cuda/compute/algorithms/_transform.py index f987efd915c..4e32f9dfbdc 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_transform.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_transform.py @@ -9,7 +9,7 @@ from .. import _bindings from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import set_cccl_iterator_state from .._utils import protocols from ..op import OpAdapter, make_op_adapter @@ -33,11 +33,17 @@ def __init__( out_type = cccl.get_value_type(d_out) self.op_cccl = op.compile((in_type,), out_type) - self.build_result = cccl.call_build( + self.build_result = cache_build_result( _bindings.DeviceUnaryTransform, - self.d_in_cccl, - self.d_out_cccl, - self.op_cccl, + d_in, + d_out, + op, + builder=lambda: cccl.call_build( + _bindings.DeviceUnaryTransform, + self.d_in_cccl, + self.d_out_cccl, + self.op_cccl, + ), ) def __call__( @@ -92,12 +98,19 @@ def __init__( out_type = cccl.get_value_type(d_out) self.op_cccl = op.compile((in1_type, in2_type), out_type) - self.build_result = cccl.call_build( + self.build_result = cache_build_result( _bindings.DeviceBinaryTransform, - self.d_in1_cccl, - self.d_in2_cccl, - self.d_out_cccl, - self.op_cccl, + d_in1, + d_in2, + d_out, + op, + builder=lambda: cccl.call_build( + _bindings.DeviceBinaryTransform, + self.d_in1_cccl, + self.d_in2_cccl, + self.d_out_cccl, + self.op_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py index 0e39f182507..7b42038dfbf 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py @@ -8,7 +8,7 @@ from .. import _bindings, types from .. 
import _cccl_interop as cccl -from .._caching import cache_with_registered_key_functions +from .._caching import cache_build_result, cache_with_registered_key_functions from .._cccl_interop import call_build, set_cccl_iterator_state from .._utils.protocols import ( get_data_pointer, @@ -49,14 +49,23 @@ def __init__( value_type = cccl.get_value_type(d_in_keys) self.op_cccl = op.compile((value_type, value_type), types.uint8) - self.build_result = call_build( + self.build_result = cache_build_result( _bindings.DeviceUniqueByKeyBuildResult, - self.d_in_keys_cccl, - self.d_in_items_cccl, - self.d_out_keys_cccl, - self.d_out_items_cccl, - self.d_out_num_selected_cccl, - self.op_cccl, + d_in_keys, + d_in_items, + d_out_keys, + d_out_items, + d_out_num_selected, + op, + builder=lambda: call_build( + _bindings.DeviceUniqueByKeyBuildResult, + self.d_in_keys_cccl, + self.d_in_items_cccl, + self.d_out_keys_cccl, + self.d_out_items_cccl, + self.d_out_num_selected_cccl, + self.op_cccl, + ), ) def __call__( diff --git a/python/cuda_cccl/cuda/compute/iterators/_base.py b/python/cuda_cccl/cuda/compute/iterators/_base.py index cc1b1b83fc8..746f2dc5931 100644 --- a/python/cuda_cccl/cuda/compute/iterators/_base.py +++ b/python/cuda_cccl/cuda/compute/iterators/_base.py @@ -9,6 +9,7 @@ from __future__ import annotations import hashlib +import threading from typing import Hashable from .._bindings import Iterator, IteratorKind, IteratorState, Op @@ -54,6 +55,7 @@ class IteratorBase: "_input_deref_op", "_output_deref_op", "_uid_cached", + "_op_lock", ] def __init__( @@ -75,6 +77,11 @@ def __init__( self._input_deref_op: Op | None = None self._output_deref_op: Op | None = None self._uid_cached: str | None = None + # Free-threaded Python can let multiple threads share a read-only + # iterator object and race during the first lazy Op construction. + # The lock only protects that cache miss path; cached access stays + # lock-free and iterator mutation remains the caller's responsibility. 
+ self._op_lock = threading.Lock() @property def state(self) -> IteratorState: @@ -117,19 +124,25 @@ def _make_output_deref_symbol(self) -> str: def get_advance_op(self) -> Op: """Get the cached Op for the advance operation.""" if self._advance_op is None: - self._advance_op = self._make_advance_op() + with self._op_lock: + if self._advance_op is None: + self._advance_op = self._make_advance_op() return self._advance_op def get_input_deref_op(self) -> Op | None: """Get the cached Op for input dereference operation, or None if not supported.""" if self._input_deref_op is None: - self._input_deref_op = self._make_input_deref_op() + with self._op_lock: + if self._input_deref_op is None: + self._input_deref_op = self._make_input_deref_op() return self._input_deref_op def get_output_deref_op(self) -> Op | None: """Get the cached Op for output dereference operation, or None if not supported.""" if self._output_deref_op is None: - self._output_deref_op = self._make_output_deref_op() + with self._op_lock: + if self._output_deref_op is None: + self._output_deref_op = self._make_output_deref_op() return self._output_deref_op @property diff --git a/python/cuda_cccl/tests/compute/test_free_threading_stress.py b/python/cuda_cccl/tests/compute/test_free_threading_stress.py new file mode 100644 index 00000000000..fa2e389a627 --- /dev/null +++ b/python/cuda_cccl/tests/compute/test_free_threading_stress.py @@ -0,0 +1,1444 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import concurrent.futures +import os +import sys +import sysconfig +import threading +from dataclasses import dataclass +from typing import Callable + +import numpy as np +import pytest + + +pytestmark = [ + pytest.mark.no_numba, + pytest.mark.no_verify_sass( + reason="Free-threading stress tests intentionally run concurrent workers." 
+ ), +] + +STRESS_ITERATIONS = int(os.environ.get("CCCL_FREE_THREADING_STRESS_ITERATIONS", "10")) +STRESS_THREADS = int(os.environ.get("CCCL_FREE_THREADING_STRESS_THREADS", "2")) +TRANSFORM_NATIVE_CACHE_THREADS = int( + os.environ.get( + "CCCL_FREE_THREADING_TRANSFORM_NATIVE_CACHE_THREADS", + str(max(STRESS_THREADS, 4)), + ) +) + + +def _is_free_threaded_build() -> bool: + return sysconfig.get_config_var("Py_GIL_DISABLED") in (1, "1") + + +def _assert_gil_disabled(where: str) -> None: + is_gil_enabled = getattr(sys, "_is_gil_enabled", None) + if is_gil_enabled is not None and is_gil_enabled(): + pytest.fail(f"the GIL is enabled {where}") + + +def _require_free_threaded_python() -> None: + if not _is_free_threaded_build(): + pytest.skip("requires a free-threaded CPython build") + _assert_gil_disabled("before importing cuda.compute") + + +@pytest.fixture +def compute_modules(): + _require_free_threaded_python() + + import cupy as cp + + _assert_gil_disabled("after importing cupy") + + import cuda.compute as cc + + _assert_gil_disabled("after importing cuda.compute") + cc.clear_all_caches() + try: + yield cp, cc + finally: + cc.clear_all_caches() + + +class _CudaStream: + def __init__(self, stream): + self.stream = stream + + def __cuda_stream__(self): + return (0, self.stream.ptr) + + @property + def ptr(self): + return self.stream.ptr + + +def _make_stream(cp): + stream = cp.cuda.Stream() + return stream, _CudaStream(stream) + + +def _run_threaded(workers: list[Callable[[threading.Barrier], None]]) -> None: + barrier = threading.Barrier(len(workers)) + with concurrent.futures.ThreadPoolExecutor(max_workers=len(workers)) as executor: + futures = [executor.submit(worker, barrier) for worker in workers] + for future in futures: + future.result() + _assert_gil_disabled("after concurrent cuda.compute operations") + + +def _call_with_temp(cp, algorithm, **kwargs): + temp_storage_bytes = algorithm(temp_storage=None, **kwargs) + temp_storage = cp.empty(temp_storage_bytes, 
dtype=np.uint8) + return algorithm(temp_storage=temp_storage, **kwargs) + + +def _get_build_result(algorithm): + if hasattr(algorithm, "build_result"): + return algorithm.build_result + if hasattr(algorithm, "partitioner"): + return _get_build_result(algorithm.partitioner) + raise AssertionError(f"{type(algorithm).__name__} does not expose a build result") + + +def _selected_segments(keys, values, starts, ends, descending=False): + out_keys = keys.copy() + out_values = values.copy() + for start, end in zip(starts, ends): + segment_keys = keys[start:end] + order = np.argsort(segment_keys, kind="stable") + if descending: + order = order[::-1] + out_keys[start:end] = segment_keys[order] + out_values[start:end] = values[start:end][order] + return out_keys, out_values + + +@dataclass(frozen=True) +class _AlgorithmCase: + name: str + make_shared: Callable + make_worker: Callable + run: Callable + check: Callable + + def __str__(self): + return self.name + + +def _run_thread_local_algorithm_case(cp, cc, case: _AlgorithmCase) -> None: + warm_algorithm = case.make_shared(cp, cc) + + warm_worker = case.make_worker(cp, cc, worker_id=0, iteration=-1) + case.run(cp, cc, warm_algorithm, warm_worker) + case.check(cp, cc, warm_worker) + + for iteration in range(STRESS_ITERATIONS): + worker_state = [ + case.make_worker(cp, cc, worker_id=worker_id, iteration=iteration) + for worker_id in range(STRESS_THREADS) + ] + returned_algorithms = [None] * STRESS_THREADS + + def make_thread(worker_id, worker): + def thread(barrier): + barrier.wait() + algorithm = case.make_shared(cp, cc) + returned_algorithms[worker_id] = algorithm + case.run(cp, cc, algorithm, worker) + case.check(cp, cc, worker) + + return thread + + _run_threaded( + [make_thread(worker_id, worker) for worker_id, worker in enumerate(worker_state)] + ) + + assert len({id(algorithm) for algorithm in returned_algorithms}) == len( + returned_algorithms + ) + assert len( + {id(_get_build_result(algorithm)) for algorithm in 
returned_algorithms} + ) == 1 + + +def _make_reduce_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_in = np.arange(64, dtype=np.int32) + worker_id * 101 + iteration + h_init = np.array([7 + worker_id], dtype=np.int32) + with stream: + d_in = cp.asarray(h_in) + d_out = cp.empty(1, dtype=np.int32) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_in": h_in, + "d_in": d_in, + "d_out": d_out, + "h_init": h_init, + } + + +def _make_reduce_shared(cp, cc): + worker = _make_reduce_worker(cp, cc, 0, -1) + return cc.make_reduce_into( + d_in=worker["d_in"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + h_init=worker["h_init"], + ) + + +def _run_reduce(cp, cc, reducer, worker): + _call_with_temp( + cp, + reducer, + d_in=worker["d_in"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + h_init=worker["h_init"], + num_items=worker["h_in"].size, + stream=worker["cuda_stream"], + ) + + +def _check_reduce(cp, cc, worker): + worker["stream"].synchronize() + expected = worker["h_in"].sum(dtype=np.int64) + int(worker["h_init"][0]) + assert int(worker["d_out"].get()[0]) == int(expected) + + +def _make_unary_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_in = np.arange(32, dtype=np.int32) + worker_id * 17 + iteration + with stream: + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_in": h_in, + "d_in": d_in, + "d_out": d_out, + } + + +def _make_unary_shared(cp, cc): + worker = _make_unary_worker(cp, cc, 0, -1) + return cc.make_unary_transform( + d_in=worker["d_in"], d_out=worker["d_out"], op=cc.OpKind.NEGATE + ) + + +def _make_unary_for_worker(cp, cc, worker): + return cc.make_unary_transform( + d_in=worker["d_in"], d_out=worker["d_out"], op=cc.OpKind.NEGATE + ) + + +def _run_unary(cp, cc, transformer, worker): + transformer( + d_in=worker["d_in"], + d_out=worker["d_out"], + op=cc.OpKind.NEGATE, + num_items=worker["h_in"].size, + 
stream=worker["cuda_stream"], + ) + + +def _check_unary(cp, cc, worker): + worker["stream"].synchronize() + np.testing.assert_array_equal(worker["d_out"].get(), -worker["h_in"]) + + +def _make_binary_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_in1 = np.arange(32, dtype=np.int32) + worker_id * 13 + h_in2 = np.arange(32, dtype=np.int32) + iteration * 7 + with stream: + d_in1 = cp.asarray(h_in1) + d_in2 = cp.asarray(h_in2) + d_out = cp.empty_like(d_in1) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_in1": h_in1, + "h_in2": h_in2, + "d_in1": d_in1, + "d_in2": d_in2, + "d_out": d_out, + } + + +def _make_binary_shared(cp, cc): + worker = _make_binary_worker(cp, cc, 0, -1) + return cc.make_binary_transform( + d_in1=worker["d_in1"], + d_in2=worker["d_in2"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + ) + + +def _make_binary_for_worker(cp, cc, worker): + return cc.make_binary_transform( + d_in1=worker["d_in1"], + d_in2=worker["d_in2"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + ) + + +def _run_binary(cp, cc, transformer, worker): + transformer( + d_in1=worker["d_in1"], + d_in2=worker["d_in2"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + num_items=worker["h_in1"].size, + stream=worker["cuda_stream"], + ) + + +def _check_binary(cp, cc, worker): + worker["stream"].synchronize() + np.testing.assert_array_equal(worker["d_out"].get(), worker["h_in1"] + worker["h_in2"]) + + +def _make_scan_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_in = np.arange(1, 33, dtype=np.int32) + worker_id + iteration + h_init = np.array([3 + worker_id], dtype=np.int32) + with stream: + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_in": h_in, + "h_init": h_init, + "d_in": d_in, + "d_out": d_out, + } + + +def _make_exclusive_scan_shared(cp, cc): + worker = _make_scan_worker(cp, cc, 0, -1) + return cc.make_exclusive_scan( + 
d_in=worker["d_in"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + init_value=worker["h_init"], + ) + + +def _make_inclusive_scan_shared(cp, cc): + worker = _make_scan_worker(cp, cc, 0, -1) + return cc.make_inclusive_scan( + d_in=worker["d_in"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + init_value=worker["h_init"], + ) + + +def _run_scan(cp, cc, scanner, worker): + _call_with_temp( + cp, + scanner, + d_in=worker["d_in"], + d_out=worker["d_out"], + op=cc.OpKind.PLUS, + init_value=worker["h_init"], + num_items=worker["h_in"].size, + stream=worker["cuda_stream"], + ) + + +def _check_exclusive_scan(cp, cc, worker): + worker["stream"].synchronize() + expected = np.empty_like(worker["h_in"]) + expected[0] = worker["h_init"][0] + expected[1:] = worker["h_init"][0] + np.cumsum(worker["h_in"][:-1]) + np.testing.assert_array_equal(worker["d_out"].get(), expected) + + +def _check_inclusive_scan(cp, cc, worker): + worker["stream"].synchronize() + expected = worker["h_init"][0] + np.cumsum(worker["h_in"]) + np.testing.assert_array_equal(worker["d_out"].get(), expected) + + +def _make_segmented_reduce_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_in = np.arange(1, 17, dtype=np.int32) + worker_id * 3 + iteration + h_start_offsets = np.array([0, 3, 8, 12], dtype=np.int32) + h_end_offsets = np.array([3, 8, 12, 16], dtype=np.int32) + h_init = np.array([worker_id], dtype=np.int32) + with stream: + d_in = cp.asarray(h_in) + d_out = cp.empty(len(h_start_offsets), dtype=np.int32) + d_start_offsets = cp.asarray(h_start_offsets) + d_end_offsets = cp.asarray(h_end_offsets) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_in": h_in, + "h_start_offsets": h_start_offsets, + "h_end_offsets": h_end_offsets, + "h_init": h_init, + "d_in": d_in, + "d_out": d_out, + "d_start_offsets": d_start_offsets, + "d_end_offsets": d_end_offsets, + } + + +def _make_segmented_reduce_shared(cp, cc): + worker = _make_segmented_reduce_worker(cp, cc, 0, 
-1) + return cc.make_segmented_reduce( + d_in=worker["d_in"], + d_out=worker["d_out"], + start_offsets_in=worker["d_start_offsets"], + end_offsets_in=worker["d_end_offsets"], + op=cc.OpKind.PLUS, + h_init=worker["h_init"], + ) + + +def _run_segmented_reduce(cp, cc, reducer, worker): + _call_with_temp( + cp, + reducer, + d_in=worker["d_in"], + d_out=worker["d_out"], + num_segments=len(worker["h_start_offsets"]), + start_offsets_in=worker["d_start_offsets"], + end_offsets_in=worker["d_end_offsets"], + op=cc.OpKind.PLUS, + h_init=worker["h_init"], + stream=worker["cuda_stream"], + ) + + +def _check_segmented_reduce(cp, cc, worker): + worker["stream"].synchronize() + expected = np.array( + [ + worker["h_in"][start:end].sum() + worker["h_init"][0] + for start, end in zip(worker["h_start_offsets"], worker["h_end_offsets"]) + ], + dtype=np.int32, + ) + np.testing.assert_array_equal(worker["d_out"].get(), expected) + + +def _make_histogram_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + lower = np.float32(worker_id * 10) + upper = np.float32(lower + 8) + h_samples = np.array( + [ + lower + 0.5, + lower + 1.5, + lower + 2.0, + lower + 3.5, + lower + 6.0, + upper + 1.0, + ], + dtype=np.float32, + ) + h_num_levels = np.array([5], dtype=np.int32) + h_lower = np.array([lower], dtype=np.float32) + h_upper = np.array([upper], dtype=np.float32) + with stream: + d_samples = cp.asarray(h_samples) + d_histogram = cp.zeros(h_num_levels[0] - 1, dtype=np.int32) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_samples": h_samples, + "h_num_levels": h_num_levels, + "h_lower": h_lower, + "h_upper": h_upper, + "d_samples": d_samples, + "d_histogram": d_histogram, + } + + +def _make_histogram_shared(cp, cc): + worker = _make_histogram_worker(cp, cc, 0, -1) + return cc.make_histogram_even( + d_samples=worker["d_samples"], + d_histogram=worker["d_histogram"], + h_num_output_levels=worker["h_num_levels"], + h_lower_level=worker["h_lower"], + 
h_upper_level=worker["h_upper"], + num_samples=worker["h_samples"].size, + ) + + +def _run_histogram(cp, cc, histogrammer, worker): + with worker["stream"]: + worker["d_histogram"].fill(0) + _call_with_temp( + cp, + histogrammer, + d_samples=worker["d_samples"], + d_histogram=worker["d_histogram"], + h_num_output_levels=worker["h_num_levels"], + h_lower_level=worker["h_lower"], + h_upper_level=worker["h_upper"], + num_samples=worker["h_samples"].size, + stream=worker["cuda_stream"], + ) + + +def _check_histogram(cp, cc, worker): + worker["stream"].synchronize() + expected, _ = np.histogram( + worker["h_samples"], + bins=int(worker["h_num_levels"][0] - 1), + range=(float(worker["h_lower"][0]), float(worker["h_upper"][0])), + ) + np.testing.assert_array_equal(worker["d_histogram"].get(), expected.astype(np.int32)) + + +def _make_binary_search_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_data = np.array([90, 70, 50, 30, 10], dtype=np.int32) - worker_id + h_values = np.array([95, 70, 45, 10, 5], dtype=np.int32) - worker_id + with stream: + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = cp.empty(h_values.size, dtype=np.uintp) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_data": h_data, + "h_values": h_values, + "d_data": d_data, + "d_values": d_values, + "d_out": d_out, + } + + +def _make_lower_bound_shared(cp, cc): + worker = _make_binary_search_worker(cp, cc, 0, -1) + return cc.make_lower_bound( + d_data=worker["d_data"], + d_values=worker["d_values"], + d_out=worker["d_out"], + comp=cc.OpKind.GREATER, + ) + + +def _make_upper_bound_shared(cp, cc): + worker = _make_binary_search_worker(cp, cc, 0, -1) + return cc.make_upper_bound( + d_data=worker["d_data"], + d_values=worker["d_values"], + d_out=worker["d_out"], + comp=cc.OpKind.GREATER, + ) + + +def _run_binary_search(cp, cc, searcher, worker): + searcher( + d_data=worker["d_data"], + num_items=worker["h_data"].size, + 
d_values=worker["d_values"], + num_values=worker["h_values"].size, + d_out=worker["d_out"], + comp=cc.OpKind.GREATER, + stream=worker["cuda_stream"], + ) + + +def _check_lower_bound(cp, cc, worker): + worker["stream"].synchronize() + expected = np.searchsorted(-worker["h_data"], -worker["h_values"], side="left") + np.testing.assert_array_equal(worker["d_out"].get(), expected.astype(np.uintp)) + + +def _check_upper_bound(cp, cc, worker): + worker["stream"].synchronize() + expected = np.searchsorted(-worker["h_data"], -worker["h_values"], side="right") + np.testing.assert_array_equal(worker["d_out"].get(), expected.astype(np.uintp)) + + +def _make_select_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_in = np.array( + [True, False, worker_id % 2 == 0, True, False, iteration % 2 == 0], + dtype=np.bool_, + ) + with stream: + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + d_count = cp.empty(2, dtype=np.uint64) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_in": h_in, + "d_in": d_in, + "d_out": d_out, + "d_count": d_count, + } + + +def _make_select_shared(cp, cc): + worker = _make_select_worker(cp, cc, 0, -1) + return cc.make_select( + d_in=worker["d_in"], + d_out=worker["d_out"], + d_num_selected_out=worker["d_count"], + cond=cc.OpKind.IDENTITY, + ) + + +def _run_select(cp, cc, selector, worker): + _call_with_temp( + cp, + selector, + d_in=worker["d_in"], + d_out=worker["d_out"], + d_num_selected_out=worker["d_count"], + cond=cc.OpKind.IDENTITY, + num_items=worker["h_in"].size, + stream=worker["cuda_stream"], + ) + + +def _check_select(cp, cc, worker): + worker["stream"].synchronize() + count = int(worker["d_count"].get()[0]) + expected = worker["h_in"][worker["h_in"]] + assert count == expected.size + np.testing.assert_array_equal(worker["d_out"].get()[:count], expected) + + +def _make_three_way_shared(cp, cc): + worker = _make_select_worker(cp, cc, 0, -1) + d_unselected = cp.empty_like(worker["d_in"]) + 
return cc.make_three_way_partition( + d_in=worker["d_in"], + d_first_part_out=worker["d_out"], + d_second_part_out=d_unselected, + d_unselected_out=cp.empty_like(worker["d_in"]), + d_num_selected_out=worker["d_count"], + select_first_part_op=cc.OpKind.IDENTITY, + select_second_part_op=cc.OpKind.LOGICAL_NOT, + ) + + +def _make_three_way_worker(cp, cc, worker_id, iteration): + worker = _make_select_worker(cp, cc, worker_id, iteration) + stream = worker["stream"] + with stream: + worker["d_second_out"] = cp.empty_like(worker["d_in"]) + worker["d_unselected"] = cp.empty_like(worker["d_in"]) + return worker + + +def _run_three_way(cp, cc, partitioner, worker): + _call_with_temp( + cp, + partitioner, + d_in=worker["d_in"], + d_first_part_out=worker["d_out"], + d_second_part_out=worker["d_second_out"], + d_unselected_out=worker["d_unselected"], + d_num_selected_out=worker["d_count"], + select_first_part_op=cc.OpKind.IDENTITY, + select_second_part_op=cc.OpKind.LOGICAL_NOT, + num_items=worker["h_in"].size, + stream=worker["cuda_stream"], + ) + + +def _check_three_way(cp, cc, worker): + worker["stream"].synchronize() + counts = worker["d_count"].get() + true_count = int(np.count_nonzero(worker["h_in"])) + false_count = int(worker["h_in"].size - true_count) + assert int(counts[0]) == true_count + assert int(counts[1]) == false_count + np.testing.assert_array_equal( + worker["d_out"].get()[:true_count], np.ones(true_count, dtype=np.bool_) + ) + np.testing.assert_array_equal( + worker["d_second_out"].get()[:false_count], np.zeros(false_count, dtype=np.bool_) + ) + + +def _make_unique_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + base = worker_id * 10 + iteration + h_keys = np.array([base, base, base + 1, base + 2, base + 2, base + 3], dtype=np.int32) + h_items = np.arange(h_keys.size, dtype=np.int32) + worker_id * 100 + with stream: + d_in_keys = cp.asarray(h_keys) + d_in_items = cp.asarray(h_items) + d_out_keys = cp.empty_like(d_in_keys) + 
d_out_items = cp.empty_like(d_in_items) + d_count = cp.empty(1, dtype=np.int32) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_keys": h_keys, + "h_items": h_items, + "d_in_keys": d_in_keys, + "d_in_items": d_in_items, + "d_out_keys": d_out_keys, + "d_out_items": d_out_items, + "d_count": d_count, + } + + +def _make_unique_shared(cp, cc): + worker = _make_unique_worker(cp, cc, 0, -1) + return cc.make_unique_by_key( + d_in_keys=worker["d_in_keys"], + d_in_items=worker["d_in_items"], + d_out_keys=worker["d_out_keys"], + d_out_items=worker["d_out_items"], + d_out_num_selected=worker["d_count"], + op=cc.OpKind.EQUAL_TO, + ) + + +def _run_unique(cp, cc, uniquer, worker): + _call_with_temp( + cp, + uniquer, + d_in_keys=worker["d_in_keys"], + d_in_items=worker["d_in_items"], + d_out_keys=worker["d_out_keys"], + d_out_items=worker["d_out_items"], + d_out_num_selected=worker["d_count"], + op=cc.OpKind.EQUAL_TO, + num_items=worker["h_keys"].size, + stream=worker["cuda_stream"], + ) + + +def _check_unique(cp, cc, worker): + worker["stream"].synchronize() + selected = np.concatenate(([True], worker["h_keys"][1:] != worker["h_keys"][:-1])) + expected_keys = worker["h_keys"][selected] + expected_items = worker["h_items"][selected] + count = int(worker["d_count"].get()[0]) + assert count == expected_keys.size + np.testing.assert_array_equal(worker["d_out_keys"].get()[:count], expected_keys) + np.testing.assert_array_equal(worker["d_out_items"].get()[:count], expected_items) + + +def _make_merge_sort_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_keys = np.array([5, 1, 3, 1, 4, 2], dtype=np.int32) + worker_id * 10 + h_values = np.arange(h_keys.size, dtype=np.int32) + iteration * 100 + with stream: + d_in_keys = cp.asarray(h_keys) + d_in_values = cp.asarray(h_values) + d_out_keys = cp.empty_like(d_in_keys) + d_out_values = cp.empty_like(d_in_values) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_keys": h_keys, 
+ "h_values": h_values, + "d_in_keys": d_in_keys, + "d_in_values": d_in_values, + "d_out_keys": d_out_keys, + "d_out_values": d_out_values, + } + + +def _make_merge_sort_shared(cp, cc): + worker = _make_merge_sort_worker(cp, cc, 0, -1) + return cc.make_merge_sort( + d_in_keys=worker["d_in_keys"], + d_in_values=worker["d_in_values"], + d_out_keys=worker["d_out_keys"], + d_out_values=worker["d_out_values"], + op=cc.OpKind.LESS, + ) + + +def _run_merge_sort(cp, cc, sorter, worker): + _call_with_temp( + cp, + sorter, + d_in_keys=worker["d_in_keys"], + d_in_values=worker["d_in_values"], + d_out_keys=worker["d_out_keys"], + d_out_values=worker["d_out_values"], + op=cc.OpKind.LESS, + num_items=worker["h_keys"].size, + stream=worker["cuda_stream"], + ) + + +def _check_merge_sort(cp, cc, worker): + worker["stream"].synchronize() + order = np.argsort(worker["h_keys"], kind="stable") + np.testing.assert_array_equal(worker["d_out_keys"].get(), worker["h_keys"][order]) + np.testing.assert_array_equal(worker["d_out_values"].get(), worker["h_values"][order]) + + +def _make_radix_sort_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_keys = np.array([7, 3, 5, 3, 1, 9], dtype=np.uint32) + np.uint32(worker_id * 11) + h_values = np.arange(h_keys.size, dtype=np.int32) + iteration * 10 + with stream: + d_in_keys = cp.asarray(h_keys) + d_tmp_keys = cp.empty_like(d_in_keys) + d_in_values = cp.asarray(h_values) + d_tmp_values = cp.empty_like(d_in_values) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_keys": h_keys, + "h_values": h_values, + "keys": cc.DoubleBuffer(d_in_keys, d_tmp_keys), + "values": cc.DoubleBuffer(d_in_values, d_tmp_values), + } + + +def _make_radix_sort_shared(cp, cc): + worker = _make_radix_sort_worker(cp, cc, 0, -1) + return cc.make_radix_sort( + d_in_keys=worker["keys"], + d_out_keys=None, + d_in_values=worker["values"], + d_out_values=None, + order=cc.SortOrder.ASCENDING, + ) + + +def _run_radix_sort(cp, cc, 
sorter, worker): + _call_with_temp( + cp, + sorter, + d_in_keys=worker["keys"], + d_out_keys=None, + d_in_values=worker["values"], + d_out_values=None, + num_items=worker["h_keys"].size, + stream=worker["cuda_stream"], + ) + + +def _check_radix_sort(cp, cc, worker): + worker["stream"].synchronize() + order = np.argsort(worker["h_keys"], kind="stable") + np.testing.assert_array_equal(worker["keys"].current().get(), worker["h_keys"][order]) + np.testing.assert_array_equal(worker["values"].current().get(), worker["h_values"][order]) + assert worker["keys"].selector == worker["values"].selector + + +def _make_segmented_sort_worker(cp, cc, worker_id, iteration): + stream, cuda_stream = _make_stream(cp) + h_keys = np.array([4, 2, 3, 8, 6, 7, 1, 5], dtype=np.int32) + worker_id * 13 + h_values = np.arange(h_keys.size, dtype=np.int32) + iteration * 100 + h_start_offsets = np.array([0, 3, 6], dtype=np.int32) + h_end_offsets = np.array([3, 6, 8], dtype=np.int32) + with stream: + d_in_keys = cp.asarray(h_keys) + d_tmp_keys = cp.empty_like(d_in_keys) + d_in_values = cp.asarray(h_values) + d_tmp_values = cp.empty_like(d_in_values) + d_start_offsets = cp.asarray(h_start_offsets) + d_end_offsets = cp.asarray(h_end_offsets) + return { + "stream": stream, + "cuda_stream": cuda_stream, + "h_keys": h_keys, + "h_values": h_values, + "h_start_offsets": h_start_offsets, + "h_end_offsets": h_end_offsets, + "keys": cc.DoubleBuffer(d_in_keys, d_tmp_keys), + "values": cc.DoubleBuffer(d_in_values, d_tmp_values), + "d_start_offsets": d_start_offsets, + "d_end_offsets": d_end_offsets, + } + + +def _make_segmented_sort_shared(cp, cc): + worker = _make_segmented_sort_worker(cp, cc, 0, -1) + return cc.make_segmented_sort( + d_in_keys=worker["keys"], + d_out_keys=None, + d_in_values=worker["values"], + d_out_values=None, + start_offsets_in=worker["d_start_offsets"], + end_offsets_in=worker["d_end_offsets"], + order=cc.SortOrder.ASCENDING, + ) + + +def _run_segmented_sort(cp, cc, sorter, worker): + 
def _check_segmented_sort(cp, cc, worker):
    """Verify the segmented sort result against the host-side reference.

    Waits on the worker's stream, builds the expected arrays with
    ``_selected_segments`` (default ascending order), and compares them with
    the ``current()`` side of the key and value buffers. Finally checks that
    both buffers report the same ``selector``.
    """
    worker["stream"].synchronize()
    reference_keys, reference_values = _selected_segments(
        worker["h_keys"],
        worker["h_values"],
        worker["h_start_offsets"],
        worker["h_end_offsets"],
    )
    key_buffer = worker["keys"]
    np.testing.assert_array_equal(key_buffer.current().get(), reference_keys)
    value_buffer = worker["values"]
    np.testing.assert_array_equal(value_buffer.current().get(), reference_values)
    assert key_buffer.selector == value_buffer.selector
def _cache_miss_reduce(cp, cc, worker_id, iteration):
    """Build, run, and verify a fresh reducer for one worker.

    Creates the worker's buffers, constructs a reducer for them with
    ``cc.OpKind.PLUS``, executes it, checks the result, and returns the
    reducer object so the caller can inspect it.
    """
    state = _make_reduce_worker(cp, cc, worker_id, iteration)
    build_kwargs = {
        "d_in": state["d_in"],
        "d_out": state["d_out"],
        "op": cc.OpKind.PLUS,
        "h_init": state["h_init"],
    }
    fresh_reducer = cc.make_reduce_into(**build_kwargs)
    _run_reduce(cp, cc, fresh_reducer, state)
    _check_reduce(cp, cc, state)
    return fresh_reducer
def _cache_miss_binary_transform(cp, cc, worker_id, iteration):
    """Build, run, and verify a fresh binary transform for one worker.

    Creates the worker's buffers, constructs a binary transform for them with
    ``cc.OpKind.PLUS``, executes it, checks the result, and returns the
    transform object so the caller can inspect it.
    """
    state = _make_binary_worker(cp, cc, worker_id, iteration)
    build_kwargs = {
        "d_in1": state["d_in1"],
        "d_in2": state["d_in2"],
        "d_out": state["d_out"],
        "op": cc.OpKind.PLUS,
    }
    fresh_transformer = cc.make_binary_transform(**build_kwargs)
    _run_binary(cp, cc, fresh_transformer, state)
    _check_binary(cp, cc, state)
    return fresh_transformer
@dataclass(frozen=True)
class _ColdTransformCase:
    """Immutable bundle of callables describing one cold-cache transform case.

    ``_run_cold_transform_native_cache_case`` calls the fields as follows:
    ``make_worker(cp, cc, worker_id=..., iteration=...)`` builds per-thread
    state, ``make_transformer(cp, cc, worker)`` builds the algorithm object,
    ``run(cp, cc, algorithm, worker)`` executes it, and
    ``check(cp, cc, worker)`` verifies the result.
    """

    # Label for the case; also the value returned by ``__str__``.
    name: str
    # Factory for the per-thread worker state.
    make_worker: Callable
    # Factory for the transform object built from a worker's buffers.
    make_transformer: Callable
    # Executes the transform object on a worker.
    run: Callable
    # Verifies the worker's output after the run.
    check: Callable

    def __str__(self) -> str:
        # Used as the pytest parameter id (the parametrization passes ids=str).
        return self.name
+ execute_barrier = threading.Barrier(TRANSFORM_NATIVE_CACHE_THREADS) + + def make_thread(worker_id, worker): + def thread(barrier): + barrier.wait() + try: + algorithm = case.make_transformer(cp, cc, worker) + returned_algorithms[worker_id] = algorithm + except BaseException: + execute_barrier.abort() + raise + + execute_barrier.wait(timeout=60) + case.run(cp, cc, algorithm, worker) + case.check(cp, cc, worker) + + return thread + + _run_threaded( + [make_thread(worker_id, worker) for worker_id, worker in enumerate(workers)] + ) + + assert len({id(algorithm) for algorithm in returned_algorithms}) == len( + returned_algorithms + ) + assert len( + {id(_get_build_result(algorithm)) for algorithm in returned_algorithms} + ) == 1 + + +@pytest.mark.parametrize( + "case", + [ + _ColdTransformCase( + "unary_transform", + _make_unary_worker, + _make_unary_for_worker, + _run_unary, + _check_unary, + ), + _ColdTransformCase( + "binary_transform", + _make_binary_worker, + _make_binary_for_worker, + _run_binary, + _check_binary, + ), + ], + ids=str, +) +def test_cold_transform_native_cache_initialization_stress(compute_modules, case): + cp, cc = compute_modules + + _run_cold_transform_native_cache_case(cp, cc, case) + + +def _iterator_counting(cp, cc): + return cc.CountingIterator(np.int32(0)), np.dtype(np.int32), 32, sum(range(32)) + + +def _iterator_constant(cp, cc): + return cc.ConstantIterator(np.int32(5)), np.dtype(np.int32), 32, 32 * 5 + + +def _iterator_cache_modified(cp, cc): + h_in = np.arange(32, dtype=np.int32) + d_in = cp.asarray(h_in) + return cc.CacheModifiedInputIterator(d_in, "stream"), h_in.dtype, h_in.size, int(h_in.sum()) + + +def _iterator_reverse(cp, cc): + h_in = np.arange(32, dtype=np.int32) + d_in = cp.asarray(h_in) + return cc.ReverseIterator(d_in), h_in.dtype, h_in.size, int(h_in.sum()) + + +def _iterator_permutation(cp, cc): + h_values = np.arange(32, dtype=np.int32) + h_indices = np.arange(31, -1, -1, dtype=np.int32) + d_values = 
cp.asarray(h_values) + d_indices = cp.asarray(h_indices) + return ( + cc.PermutationIterator(d_values, d_indices), + h_values.dtype, + h_indices.size, + int(h_values[h_indices].sum()), + ) + + +def _iterator_shuffle(cp, cc): + num_items = 32 + return ( + cc.ShuffleIterator(num_items, seed=1234), + np.dtype(np.int64), + num_items, + sum(range(num_items)), + ) + + +def _iterator_transform(cp, cc): + from cuda.compute import types + from cuda.compute._cpp_compile import compile_cpp_op_code + from cuda.compute.op import RawOp + + num_items = 32 + source = """ + extern "C" __device__ void negate_i32(void* input, void* result) { + *static_cast(result) = -*static_cast(input); + } + """ + op = RawOp(ltoir=compile_cpp_op_code(source), name="negate_i32") + return ( + cc.TransformIterator(cc.CountingIterator(np.int32(0)), op, value_type=types.int32), + np.dtype(np.int32), + num_items, + -sum(range(num_items)), + ) + + +ITERATOR_FACTORIES = [ + _iterator_counting, + _iterator_constant, + _iterator_cache_modified, + _iterator_reverse, + _iterator_permutation, + _iterator_shuffle, + _iterator_transform, +] + + +@pytest.mark.parametrize( + "make_iterator", + ITERATOR_FACTORIES, + ids=lambda fn: fn.__name__.removeprefix("_iterator_"), +) +def test_shared_iterator_object_stress(compute_modules, make_iterator): + cp, cc = compute_modules + + shared_iterator, dtype, num_items, expected_sum = make_iterator(cp, cc) + cp.cuda.Device().synchronize() + + for iteration in range(STRESS_ITERATIONS): + cc.clear_all_caches() + + def make_thread(worker_id): + stream, cuda_stream = _make_stream(cp) + h_init = np.array([worker_id], dtype=dtype) + with stream: + d_out = cp.empty(1, dtype=dtype) + + def thread(barrier): + barrier.wait() + reducer = cc.make_reduce_into( + d_in=shared_iterator, + d_out=d_out, + op=cc.OpKind.PLUS, + h_init=h_init, + ) + _call_with_temp( + cp, + reducer, + d_in=shared_iterator, + d_out=d_out, + op=cc.OpKind.PLUS, + h_init=h_init, + num_items=num_items, + 
stream=cuda_stream, + ) + stream.synchronize() + assert int(d_out.get()[0]) == int(expected_sum + h_init[0]) + + return thread + + _run_threaded([make_thread(worker_id) for worker_id in range(STRESS_THREADS)]) + + +def test_runtime_ownership_isolation(compute_modules): + cp, cc = compute_modules + + def make_thread(worker_id): + def thread(barrier): + barrier.wait() + stream, cuda_stream = _make_stream(cp) + h_in = np.arange(16, dtype=np.int32) + worker_id * 10 + h_init = np.array([worker_id], dtype=np.int32) + + with stream: + d_in = cp.asarray(h_in) + d_reduce_out = cp.empty(1, dtype=np.int32) + d_scan_out = cp.empty_like(d_in) + d_transform_out = cp.empty_like(d_in) + d_hist = cp.zeros(4, dtype=np.int32) + h_keys = np.array([3, 1, 2, 1], dtype=np.uint32) + worker_id + d_keys_in = cp.asarray(h_keys) + d_keys_tmp = cp.empty_like(d_keys_in) + + cc.reduce_into( + d_in=d_in, + d_out=d_reduce_out, + num_items=h_in.size, + op=cc.OpKind.PLUS, + h_init=h_init, + stream=cuda_stream, + ) + cc.exclusive_scan( + d_in=d_in, + d_out=d_scan_out, + op=cc.OpKind.PLUS, + init_value=h_init, + num_items=h_in.size, + stream=cuda_stream, + ) + cc.unary_transform( + d_in=d_in, + d_out=d_transform_out, + op=cc.OpKind.NEGATE, + num_items=h_in.size, + stream=cuda_stream, + ) + cc.histogram_even( + d_samples=d_in, + d_histogram=d_hist, + num_output_levels=5, + lower_level=np.int32(worker_id * 10), + upper_level=np.int32(worker_id * 10 + 16), + num_samples=h_in.size, + stream=cuda_stream, + ) + keys = cc.DoubleBuffer(d_keys_in, d_keys_tmp) + cc.radix_sort( + d_in_keys=keys, + d_out_keys=None, + d_in_values=None, + d_out_values=None, + num_items=d_keys_in.size, + order=cc.SortOrder.ASCENDING, + stream=cuda_stream, + ) + + stream.synchronize() + assert int(d_reduce_out.get()[0]) == int(h_in.sum() + worker_id) + expected_scan = np.empty_like(h_in) + expected_scan[0] = worker_id + expected_scan[1:] = worker_id + np.cumsum(h_in[:-1]) + np.testing.assert_array_equal(d_scan_out.get(), 
expected_scan) + np.testing.assert_array_equal(d_transform_out.get(), -h_in) + assert int(d_hist.sum().get()) == h_in.size + np.testing.assert_array_equal(keys.current().get(), np.sort(h_keys)) + + return thread + + for _ in range(STRESS_ITERATIONS): + _run_threaded([make_thread(worker_id) for worker_id in range(STRESS_THREADS)]) + + +def test_cache_clear_while_active_operations_is_not_a_supported_contract(): + pytest.skip( + "clear_all_caches() while cached operations are active is an unsupported " + "contract decision; see ST-19 in stress_tests.md." + ) From e461b597054a0695db414cdb7fa730ab2a128a62 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 12:21:29 -0500 Subject: [PATCH 02/16] Decouple from numba.cuda further in tests in favor of cupy and cuda.core and add minimal extra testing --- ci/matrix.yaml | 2 + ci/test_cuda_compute_minimal_python.sh | 51 +++++++++++++ .../tests/compute/test_binary_search.py | 35 +++++++++ .../cuda_cccl/tests/compute/test_bindings.py | 2 + .../compute/test_deferred_annotations.py | 2 + .../tests/compute/test_func_caching.py | 8 ++ .../cuda_cccl/tests/compute/test_histogram.py | 2 + .../cuda_cccl/tests/compute/test_iterators.py | 10 ++- .../tests/compute/test_merge_sort.py | 55 +++++++------- .../tests/compute/test_nested_struct.py | 5 ++ .../cuda_cccl/tests/compute/test_no_numba.py | 75 +----------------- .../compute/test_permutation_iterator.py | 6 ++ .../tests/compute/test_radix_sort.py | 76 ++++++++++--------- python/cuda_cccl/tests/compute/test_reduce.py | 38 +++++----- python/cuda_cccl/tests/compute/test_scan.py | 16 ++-- .../tests/compute/test_segmented_reduce.py | 3 + .../tests/compute/test_segmented_sort.py | 51 +++++++------ python/cuda_cccl/tests/compute/test_select.py | 31 ++++++++ .../tests/compute/test_shuffle_iterator.py | 1 + .../compute/test_struct_field_validation.py | 2 + .../tests/compute/test_three_way_partition.py | 46 +++++++++++ .../cuda_cccl/tests/compute/test_transform.py | 5 ++ 
.../tests/compute/test_unique_by_key.py | 69 ++++++++--------- .../test_void_ptr_wrapper_validation.py | 4 +- .../tests/compute/test_zip_iterator.py | 11 ++- 25 files changed, 379 insertions(+), 227 deletions(-) create mode 100755 ci/test_cuda_compute_minimal_python.sh diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 6146d863c87..1fcb1df75ae 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -84,6 +84,7 @@ workflows: - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'} + - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'} # CCCL packaging: - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'} - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'} @@ -539,6 +540,7 @@ jobs: test_py_headers: { name: "Test cuda.cccl.headers", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} } test_py_coop: { name: "Test cuda.coop._experimental", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} } test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} } + test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} } test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} } # Run jobs for 'target' project 
(ci/util/build_and_test_targets.sh): diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh new file mode 100755 index 00000000000..c48b313ed31 --- /dev/null +++ b/ci/test_cuda_compute_minimal_python.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "$ci_dir/.." && pwd)" +source "$ci_dir/pyenv_helper.sh" + +# Parse common arguments +source "$ci_dir/util/python/common_arg_parser.sh" +parse_python_args "$@" +require_py_version "Usage: $0 -py-version " + +cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2) + +# Setup Python environment +setup_python_env "${py_version}" + +# Fetch or build the cuda_cccl wheel: +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh") + "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/ + wheelhouse_dir="/home/coder/cccl/wheelhouse" +else + "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}" + wheelhouse_dir="${repo_root}/wheelhouse" +fi + +# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the +# full cu* extras because those pull in numba/numba-cuda. In a clean minimal +# environment, the test phase below runs only tests marked no_numba. 
+CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)" +python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]" +python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x" + +if python - <<'PY' +try: + import numba.cuda # noqa: F401 +except Exception as exc: + print(f"numba.cuda unavailable; running no_numba subset: {exc!r}") + raise SystemExit(1) +else: + print("numba.cuda available; running full compute test suite.") +PY +then + cd "${repo_root}/python/cuda_cccl/tests/" + python -m pytest -n 6 -v compute/ -m "not large" +else + cd "${repo_root}/python/cuda_cccl/tests/" + python -m pytest -n 6 -v compute/ -m "not large and no_numba" +fi diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index a4f7d047a9b..94863c3fe74 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -6,6 +6,8 @@ import pytest import cuda.compute +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp DTYPE_LIST = [ np.int32, @@ -39,6 +41,39 @@ def disable_sass_check(monkeypatch): ) +def _raw_less_i32_op() -> RawOp: + source = """ +extern "C" __device__ void less_i32(void* lhs, void* rhs, void* result) { + int lhs_value = *static_cast(lhs); + int rhs_value = *static_cast(rhs); + *static_cast(result) = lhs_value < rhs_value; +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="less_i32") + + +@pytest.mark.no_numba +def test_lower_bound_raw_op_minimal(): + h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) + h_values = np.array([0, 3, 4, 10], dtype=np.int32) + + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = cp.empty(len(h_values), dtype=np.uintp) + + cuda.compute.lower_bound( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_out, + comp=_raw_less_i32_op(), + ) + + expected = 
np.searchsorted(h_data, h_values, side="left").astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + @pytest.mark.parametrize("dtype", DTYPE_LIST) @pytest.mark.parametrize( "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)] diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py index 2f16da620ef..ccfbd8d0bc3 100644 --- a/python/cuda_cccl/tests/compute/test_bindings.py +++ b/python/cuda_cccl/tests/compute/test_bindings.py @@ -4,6 +4,8 @@ import cuda.compute._bindings as bindings +pytestmark = pytest.mark.no_numba + @pytest.fixture( params=[ diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py index c30aeda4068..a584b8ffbe2 100644 --- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py +++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py @@ -7,10 +7,12 @@ import cupy as cp import numpy as np +import pytest from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into +@pytest.mark.no_numba def test_deferred_annotations(): # test that we can use @gpu_struct with deferred annotations # GH: #6421 diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py index 0da32901a32..5d12458790d 100644 --- a/python/cuda_cccl/tests/compute/test_func_caching.py +++ b/python/cuda_cccl/tests/compute/test_func_caching.py @@ -1,10 +1,12 @@ import numpy as np +import pytest from cuda.compute._caching import CachableFunction global_x = 1 +@pytest.mark.no_numba def test_func_caching_basic(): def func(x): return x @@ -19,6 +21,7 @@ def func(x): assert f1 == f2 +@pytest.mark.no_numba def test_func_caching_different_names(): def func(x): return x @@ -33,6 +36,7 @@ def func2(x): assert f1 != f2 +@pytest.mark.no_numba def test_func_caching_different_code(): def func(x): return x @@ -46,6 +50,7 @@ def func(x): assert f1 != f2 
+@pytest.mark.no_numba def test_func_caching_with_closure(): def factory(x): def func(y): @@ -61,6 +66,7 @@ def func(y): assert f1 != f3 +@pytest.mark.no_numba def test_func_caching_with_global_variable(): global global_x @@ -107,6 +113,7 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func3) +@pytest.mark.no_numba def test_func_caching_with_global_np_ufunc(): def make_func(): def func(x): @@ -126,6 +133,7 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func2) +@pytest.mark.no_numba def test_func_caching_with_aliased_np_ufunc(): def make_func1(): amin = np.argmin diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py index 0a9d96674e6..5c104114754 100644 --- a/python/cuda_cccl/tests/compute/test_histogram.py +++ b/python/cuda_cccl/tests/compute/test_histogram.py @@ -12,6 +12,8 @@ CountingIterator, ) +pytestmark = pytest.mark.no_numba + DTYPE_LIST = [ np.uint8, np.uint16, diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py index 49e2ab39d15..c3b6acc9fc9 100644 --- a/python/cuda_cccl/tests/compute/test_iterators.py +++ b/python/cuda_cccl/tests/compute/test_iterators.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp -import numba.cuda import numpy as np import pytest @@ -22,6 +21,7 @@ ) +@pytest.mark.no_numba def test_constant_iterator_equality(): it1 = ConstantIterator(np.int32(0)) it2 = ConstantIterator(np.int32(0)) @@ -32,6 +32,7 @@ def test_constant_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba def test_counting_iterator_equality(): it1 = CountingIterator(np.int32(0)) it2 = CountingIterator(np.int32(0)) @@ -42,6 +43,7 @@ def test_counting_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba def test_cache_modified_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], 
dtype="int32") @@ -122,11 +124,14 @@ def reverse_iterator_array(request): if array_type == "cupy": array = cp.array(base_array) else: + import numba.cuda + array = numba.cuda.to_device(base_array) return array +@pytest.mark.no_numba def test_reverse_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -141,6 +146,7 @@ def test_reverse_input_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba def test_reverse_output_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -155,6 +161,7 @@ def test_reverse_output_iterator_equality(): assert it1.kind != it4.kind +@pytest.mark.no_numba @pytest.mark.parametrize( "shape, itemsize, expected", [ @@ -179,6 +186,7 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected): assert result == expected +@pytest.mark.no_numba @pytest.mark.parametrize( "shape, dtype", [ diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py index 33d7d15de58..db97d5aca01 100644 --- a/python/cuda_cccl/tests/compute/test_merge_sort.py +++ b/python/cuda_cccl/tests/compute/test_merge_sort.py @@ -5,7 +5,6 @@ from typing import List import cupy as cp -import numba.cuda import numpy as np import pytest @@ -82,11 +81,11 @@ def compare_op(lhs, rhs): def test_merge_sort_keys(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_keys = cp.asarray(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items) - h_out_keys = d_in_keys.copy_to_host() + h_out_keys = d_in_keys.get() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -102,13 +101,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch): h_in_keys = random_array(num_items, dtype) h_in_items = random_array(num_items, np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - 
d_in_items = numba.cuda.to_device(h_in_items) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items) - h_out_keys = d_in_keys.copy_to_host() - h_out_items = d_in_items.copy_to_host() + h_out_keys = d_in_keys.get() + h_out_items = d_in_items.get() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -123,12 +122,12 @@ def test_merge_sort_keys_copy(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items) - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -146,15 +145,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items) - h_out_keys = d_out_keys.copy_to_host() - h_out_items = d_out_items.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_items = d_out_items.get() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -225,11 +224,11 @@ def compare_complex(lhs, rhs): imaginary = random_array(num_items, np.int64, max_value) h_in_keys = real + 1j * imaginary - d_in_keys = 
numba.cuda.to_device(h_in_keys) + d_in_keys = cp.asarray(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items) - h_out_keys = d_in_keys.copy_to_host() + h_out_keys = d_in_keys.get() h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)] np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -240,15 +239,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream") merge_sort_device(i_input, None, d_out_keys, None, op, num_items) h_in_keys.sort() - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -265,10 +264,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -277,8 +276,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items ) - h_out_keys = d_out_keys.copy_to_host() - h_out_items = d_out_items.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_items = d_out_items.get() argsort = 
np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -307,6 +306,7 @@ def test_merge_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) +@pytest.mark.no_numba def test_merge_sort_well_known_less(): dtype = np.int32 @@ -326,6 +326,7 @@ def test_merge_sort_well_known_less(): np.testing.assert_equal(d_out_keys.get(), expected) +@pytest.mark.no_numba def test_merge_sort_well_known_greater(): dtype = np.int32 diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py index 9cbfa40e393..715873af3c8 100644 --- a/python/cuda_cccl/tests/compute/test_nested_struct.py +++ b/python/cuda_cccl/tests/compute/test_nested_struct.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np +import pytest import cuda.compute from cuda.compute import ZipIterator, gpu_struct @@ -149,6 +150,7 @@ def sum_pixels(p1, p2): assert result["color"]["b"] == expected_b +@pytest.mark.no_numba def test_dict_init_nested_struct(): """Test initializing a nested struct with a dictionary.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -162,6 +164,7 @@ def test_dict_init_nested_struct(): assert np.isclose(obj.inner.b, 3.14) +@pytest.mark.no_numba def test_dict_init_per_field(): """Test initializing a struct with a dictionary for a nested field.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -175,6 +178,7 @@ def test_dict_init_per_field(): assert np.isclose(obj.inner.b, 3.14) +@pytest.mark.no_numba def test_dict_init_deeply_nested(): """Test initializing deeply nested structs (3+ levels) with dictionaries.""" Level1 = gpu_struct({"value": np.int32}) @@ -189,6 +193,7 @@ def test_dict_init_deeply_nested(): assert obj.middle.nested.value == 42 +@pytest.mark.no_numba def test_dict_init_mixed(): """Test mixed initialization with some dicts and some direct values.""" Inner1 = gpu_struct({"a": np.int32, "b": np.int32}) diff 
--git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py index 8f1d271e1ed..bbab965c8bf 100644 --- a/python/cuda_cccl/tests/compute/test_no_numba.py +++ b/python/cuda_cccl/tests/compute/test_no_numba.py @@ -1,12 +1,6 @@ -import cupy as cp -import numpy as np import pytest -import cuda.compute -from cuda.compute import OpKind - -# Mainly, these tests check that we can use algorithms with OpKind -# operators while not requiring numba to be installed. +# Check that tests marked no_numba fail fast if they import numba. pytestmark = pytest.mark.no_numba @@ -16,70 +10,3 @@ def test_import_numba_raises(): ImportError, match="This test is marked 'no_numba' but attempted to import it" ): import numba.cuda # noqa: F401 - - -def test_reduce_op_kind(): - num_items = 100 - h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) - - h_init = np.array(0, dtype=np.int32) - cuda.compute.reduce_into( - d_in=d_input, d_out=d_output, num_items=num_items, op=OpKind.PLUS, h_init=h_init - ) - - result = d_output.get()[0] - expected = np.sum(h_input) - assert result == expected - - -def test_binary_transform_op_kind(): - num_items = 100 - h_input1 = np.arange(num_items, dtype=np.int32) - h_input2 = np.arange(num_items, dtype=np.int32) * 2 - d_input1 = cp.array(h_input1) - d_input2 = cp.array(h_input2) - d_output = cp.empty(num_items, dtype=np.int32) - - cuda.compute.binary_transform( - d_in1=d_input1, - d_in2=d_input2, - d_out=d_output, - op=OpKind.PLUS, - num_items=num_items, - ) - - result = d_output.get() - expected = h_input1 + h_input2 - assert np.array_equal(result, expected) - - -def test_segmented_sort_op_kind(): - # Create segments: [3, 1, 4] | [1, 5, 9, 2] | [6, 5] - num_items = 9 - h_keys = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5], dtype=np.int32) - h_offsets = np.array([0, 3, 7, 9], dtype=np.int32) - - d_keys_in = cp.array(h_keys) - d_keys_out = cp.empty(num_items, 
dtype=np.int32) - d_offsets = cp.array(h_offsets) - - num_segments = len(h_offsets) - 1 - - cuda.compute.segmented_sort( - d_in_keys=d_keys_in, - d_out_keys=d_keys_out, - d_in_values=None, - d_out_values=None, - num_items=num_items, - num_segments=num_segments, - start_offsets_in=d_offsets[:-1], - end_offsets_in=d_offsets[1:], - order=cuda.compute.SortOrder.ASCENDING, - ) - - result = d_keys_out.get() - # Expected: [1, 3, 4] | [1, 2, 5, 9] | [5, 6] - expected = np.array([1, 3, 4, 1, 2, 5, 9, 5, 6], dtype=np.int32) - assert np.array_equal(result, expected) diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py index 74d74a7b4e6..226904b75da 100644 --- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py +++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np +import pytest import cuda.compute from cuda.compute.iterators import ( @@ -12,6 +13,7 @@ ) +@pytest.mark.no_numba def test_permutation_iterator_equality(): values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") values2 = cp.asarray([100, 200, 300], dtype="int32") @@ -37,6 +39,7 @@ def test_permutation_iterator_equality(): assert it1.kind != it5.kind +@pytest.mark.no_numba def test_permutation_iterator_with_array_values(): values = cp.asarray([10, 20, 30, 40, 50], dtype="int32") indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -54,6 +57,7 @@ def test_permutation_iterator_with_array_values(): assert d_output[0] == values[indices].sum() +@pytest.mark.no_numba def test_permutation_iterator_with_iterator_values(): values_it = CountingIterator(np.int32(10)) indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -157,6 +161,7 @@ def op(a): assert cp.all(d_out == expected) +@pytest.mark.no_numba def test_caching_permutation_iterator(): """Test that iterator compilation is cached across instances with the 
same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -204,6 +209,7 @@ def test_caching_permutation_iterator(): ) +@pytest.mark.no_numba def test_permutation_iterator_advance(): """Test PermutationIterator.__add__ only advances indices, not values.""" # Create values array [10, 20, 30, 40, 50, 60, 70] diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index 62b8e7dcb44..67186d3a4f7 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -6,11 +6,11 @@ from typing import Tuple import cupy as cp -import numba import numpy as np import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( DoubleBuffer, SortOrder, @@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu DTYPE_SIZE, ) def test_radix_sort_keys(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. 
# TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items) - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) radix_sort_device( d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items ) - h_out_keys = d_out_keys.copy_to_host() - h_out_values = d_out_values.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_values = d_out_values.get() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -218,7 +218,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. 
# TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) radix_sort_device(keys_double_buffer, None, None, None, order, num_items) - h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_keys = keys_double_buffer.current().get() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): keys_double_buffer, None, values_double_buffer, None, order, num_items ) - h_out_keys 
= keys_double_buffer.current().copy_to_host() - h_out_values = values_double_buffer.current().copy_to_host() + h_out_keys = keys_double_buffer.current().get() + h_out_values = values_double_buffer.current().get() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE_BIT_WINDOW, ) def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) radix_sort_device( d_in_keys, @@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): end_bit, ) - h_out_keys = d_out_keys.copy_to_host() - h_out_values = d_out_values.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_values = d_out_values.get() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -384,10 +384,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = 
numba.cuda.to_device(h_out_values) + d_in_keys = cp.asarray(h_in_keys) + d_in_values = cp.asarray(h_in_values) + d_out_keys = cp.asarray(h_out_keys) + d_out_values = cp.asarray(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch end_bit, ) - h_out_keys = keys_double_buffer.current().copy_to_host() - h_out_values = values_double_buffer.current().copy_to_host() + h_out_keys = keys_double_buffer.current().get() + h_out_values = values_double_buffer.current().get() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -469,8 +469,9 @@ def test_radix_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) +@pytest.mark.no_numba def test_radix_sort(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -518,8 +519,9 @@ def test_radix_sort(monkeypatch): np.testing.assert_array_equal(h_out_items, h_in_values) +@pytest.mark.no_numba def test_radix_sort_double_buffer(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. 
# TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py index df3f024f0ee..15f11428506 100644 --- a/python/cuda_cccl/tests/compute/test_reduce.py +++ b/python/cuda_cccl/tests/compute/test_reduce.py @@ -6,7 +6,6 @@ import random import cupy as cp -import numba.cuda import numpy as np import pytest from cupy.cuda import runtime @@ -73,15 +72,15 @@ def add_op(a, b): def test_device_reduce(dtype, num_items, op): init_value = 42 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == pytest.approx( sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0 ) # obtained relative error value from c2h/include/c2h/check_results.cuh @@ -94,10 +93,10 @@ def test_device_reduce_with_lambda(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) # Use a lambda function directly as the reducer cuda.compute.reduce_into( @@ -107,7 +106,7 @@ def test_device_reduce_with_lambda(): op=lambda a, b: a + b, h_init=h_init, ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == sum(h_input) + init_value @@ -118,10 +117,10 @@ def test_device_reduce_with_lambda_variable(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) h_input = 
random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) # Use a lambda function assigned to a variable as the reducer cuda.compute.reduce_into( @@ -131,24 +130,24 @@ def test_device_reduce_with_lambda_variable(): op=add_op_lambda, h_init=h_init, ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == sum(h_input) + init_value def test_complex_device_reduce(): h_init = np.array([40.0 + 2.0j], dtype=complex) - d_output = numba.cuda.device_array(1, dtype=complex) + d_output = cp.empty(1, dtype=complex) for num_items in [42, 420000]: real_imag = np.random.random((2, num_items)) h_input = real_imag[0] + 1j * real_imag[1] - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) assert d_input.size == num_items cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init ) - result = d_output.copy_to_host()[0] + result = d_output.get()[0] expected = np.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) @@ -162,11 +161,11 @@ def _test_device_sum_with_iterator( if use_numpy_array: h_input = np.array(l_varr, dtype_inp) - d_input = numba.cuda.to_device(h_input) + d_input = cp.asarray(h_input) else: d_input = i_input - d_output = numba.cuda.device_array(1, dtype_out) # to store device sum + d_output = cp.empty(1, dtype_out) # to store device sum h_init = np.array([start_sum_with], dtype_out) @@ -174,7 +173,7 @@ def _test_device_sum_with_iterator( d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init ) - h_output = d_output.copy_to_host() + h_output = d_output.get() assert h_output[0] == expected_result @@ -216,7 +215,7 @@ def test_device_sum_cache_modified_input_it( l_varr = [rng.randrange(100) for _ in range(num_items)] dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) + input_devarr = 
cp.asarray(np.array(l_varr, dtype=dtype_inp)) i_input = CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -691,6 +690,7 @@ def add_op(x, y): ) +@pytest.mark.no_numba def test_device_reduce_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -709,6 +709,7 @@ def test_device_reduce_well_known_plus(): assert (d_output == expected_output).all() +@pytest.mark.no_numba def test_device_reduce_well_known_minimum(): dtype = np.int32 h_init = np.array([100], dtype=dtype) @@ -727,6 +728,7 @@ def test_device_reduce_well_known_minimum(): assert (d_output == expected_output).all() +@pytest.mark.no_numba def test_device_reduce_well_known_maximum(): dtype = np.int32 h_init = np.array([-100], dtype=dtype) @@ -925,6 +927,7 @@ def sqrt(x: dtype) -> dtype: np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) +@pytest.mark.no_numba def test_reduce_with_not_guaranteed_determinism(floating_array): dtype = floating_array.dtype h_init = np.array([0], dtype=dtype) @@ -942,6 +945,7 @@ def test_reduce_with_not_guaranteed_determinism(floating_array): ) +@pytest.mark.no_numba def test_reduce_bool(): h_init = np.array([False]) d_input = cp.array([True, False, True]) diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index af6d941b11b..97f5fae630a 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -4,11 +4,11 @@ import cupy as cp -import numba.cuda import numpy as np import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( CountingIterator, OpKind, @@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea [True, False], ) def test_scan_array_input(force_inclusive, input_array, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + 
cc_major, _ = Device().compute_capability # Skip sass verification if input is complex # as LDL/STL instructions are emitted for complex types. # Also skip for: @@ -214,6 +214,7 @@ def op(a, b): np.testing.assert_allclose(expected, got, rtol=1e-5) +@pytest.mark.no_numba def test_exclusive_scan_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -232,8 +233,9 @@ def test_exclusive_scan_well_known_plus(): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_inclusive_scan_well_known_plus(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -354,7 +356,7 @@ def add_op(a, b): def test_reverse_input_iterator(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -409,6 +411,7 @@ def add_op(a, b): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba @pytest.mark.parametrize( "force_inclusive", [True, False], @@ -430,13 +433,14 @@ def test_future_init_value(force_inclusive): np.testing.assert_array_equal(expected, got) +@pytest.mark.no_numba def test_no_init_value(monkeypatch): force_inclusive = True num_items = 1024 dtype = np.dtype("int32") # Skip SASS check for CC 9.0 due to LDL/STL CI failure. 
- cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability if cc_major >= 9: import cuda.compute._cccl_interop @@ -456,6 +460,7 @@ def test_no_init_value(monkeypatch): np.testing.assert_array_equal(expected, got) +@pytest.mark.no_numba def test_no_init_value_iterator(): force_inclusive = True num_items = 1024 @@ -493,6 +498,7 @@ def test_inclusive_scan_with_lambda(): np.testing.assert_array_equal(d_output.get(), expected) +@pytest.mark.no_numba @pytest.mark.parametrize("force_inclusive", [True, False]) def test_scan_bool_maximum(force_inclusive): h_init = np.array([False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py index 950d860a14d..815c98b8edb 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py +++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py @@ -269,6 +269,7 @@ def _plus(a, b): ) +@pytest.mark.no_numba def test_segmented_reduce_well_known_plus(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -299,6 +300,7 @@ def test_segmented_reduce_well_known_plus(monkeypatch): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_segmented_reduce_well_known_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -329,6 +331,7 @@ def test_segmented_reduce_well_known_maximum(monkeypatch): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_segmented_reduce_bool_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). 
monkeypatch.setattr( diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py index 2bdd03abb42..9c2ab9f2de4 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_sort.py +++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py @@ -5,12 +5,13 @@ from typing import Tuple import cupy as cp -import numba import numpy as np import pytest import cuda.compute +pytestmark = pytest.mark.no_numba + DTYPE_LIST = [ np.uint8, np.int16, @@ -118,8 +119,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=50) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(np.empty_like(h_in_keys)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -133,7 +134,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): order=order, ) - h_out_keys = d_out_keys.copy_to_host() + h_out_keys = d_out_keys.get() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -153,10 +154,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = cp.asarray(h_in_keys) + d_in_vals = cp.asarray(h_in_vals) + d_out_keys = cp.asarray(np.empty_like(h_in_keys)) + d_out_vals = cp.asarray(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -170,8 +171,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): order=order, ) - h_out_keys = 
d_out_keys.copy_to_host() - h_out_vals = d_out_vals.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_vals = d_out_vals.get() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -189,8 +190,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): h_in_keys = random_array(num_items, dtype, max_value=20) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_in_keys = cp.asarray(h_in_keys) + d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) cuda.compute.segmented_sort( @@ -205,7 +206,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().copy_to_host() + h_out_keys = keys_db.current().get() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -224,10 +225,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = cp.asarray(h_in_keys) + d_in_vals = cp.asarray(h_in_vals) + d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) + d_tmp_vals = cp.asarray(np.empty_like(h_in_vals)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals) @@ -244,8 +245,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().copy_to_host() - h_out_vals = vals_db.current().copy_to_host() + h_out_keys = keys_db.current().get() + 
h_out_vals = vals_db.current().get() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -297,10 +298,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments): h_in_keys = random_array(num_items, np.int32, max_value=100) h_in_vals = random_array(num_items, np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = cp.asarray(h_in_keys) + d_in_vals = cp.asarray(h_in_vals) + d_out_keys = cp.asarray(np.empty_like(h_in_keys)) + d_out_vals = cp.asarray(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -314,8 +315,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments): order=order, ) - h_out_keys = d_out_keys.copy_to_host() - h_out_vals = d_out_vals.copy_to_host() + h_out_keys = d_out_keys.get() + h_out_vals = d_out_vals.get() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order ) diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index f2483860a4a..d467d33c635 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -7,6 +7,8 @@ import pytest import cuda.compute +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct DTYPE_LIST = [ @@ -59,6 +61,35 @@ def _host_select(h_in: np.ndarray, cond): return selected, np.int64(selected.size) +def _raw_even_i32_op() -> RawOp: + source = """ +extern "C" __device__ void is_even_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = (value % 2) == 0; +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32") + + 
+@pytest.mark.no_numba +def test_select_raw_op_minimal(): + h_in = np.arange(10, dtype=np.int32) + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.select( + d_in=d_in, + d_out=d_out, + d_num_selected_out=d_num_selected, + cond=_raw_even_i32_op(), + num_items=len(d_in), + ) + + num_selected = int(d_num_selected[0].get()) + np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0]) + + @pytest.mark.parametrize("dtype,num_items", select_params) def test_select_basic(dtype, num_items): h_in = random_array(num_items, dtype, max_value=100) diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py index 5c82fe1ad38..715f9e51235 100644 --- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py +++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py @@ -87,6 +87,7 @@ def test_shuffle_iterator_with_permutation_iterator(): assert sorted(result) == sorted(d_values.get()) +@pytest.mark.no_numba def test_shuffle_iterator_invalid_num_items(): with pytest.raises(ValueError, match="num_items must be > 0"): ShuffleIterator(0, seed=42) diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py index 3ddaee54d41..cf1edd7e681 100644 --- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py +++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py @@ -14,6 +14,8 @@ from cuda.compute import gpu_struct +pytestmark = pytest.mark.no_numba + def test_newline_in_field_name_is_rejected(): """Field names with newlines must be rejected — they are the exec() injection vector.""" diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 3fb5b102275..9f2683f6798 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ 
b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -7,6 +7,8 @@ import pytest import cuda.compute +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp from cuda.compute import CacheModifiedInputIterator, gpu_struct DTYPE_LIST = [ @@ -65,6 +67,50 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op): ) +def _raw_less_than_i32(name: str, threshold: int) -> RawOp: + source = f""" +extern "C" __device__ void {name}(void* x, void* result) {{ + int value = *static_cast(x); + *static_cast(result) = value < {threshold}; +}} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name=name) + + +@pytest.mark.no_numba +def test_three_way_partition_raw_op_minimal(): + h_in = np.arange(10, dtype=np.int32) + d_in = cp.asarray(h_in) + d_first = cp.empty_like(d_in) + d_second = cp.empty_like(d_in) + d_unselected = cp.empty_like(d_in) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.three_way_partition( + d_in=d_in, + d_first_part_out=d_first, + d_second_part_out=d_second, + d_unselected_out=d_unselected, + d_num_selected_out=d_num_selected, + select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3), + select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6), + num_items=len(d_in), + ) + + selected = d_num_selected.get() + first_count = int(selected[0]) + second_count = int(selected[1]) + unselected_count = len(h_in) - first_count - second_count + + np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3]) + np.testing.assert_array_equal( + d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)] + ) + np.testing.assert_array_equal( + d_unselected.get()[:unselected_count], h_in[h_in >= 6] + ) + + @pytest.mark.parametrize("dtype,num_items", three_way_partition_params) def test_three_way_partition_basic(dtype, num_items, monkeypatch): # NOTE: the SASS check failure is seen only with NVRTC 13.1: diff --git 
a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py index c7c3ca2818a..b8429726d77 100644 --- a/python/cuda_cccl/tests/compute/test_transform.py +++ b/python/cuda_cccl/tests/compute/test_transform.py @@ -266,6 +266,7 @@ def op2(a): np.testing.assert_allclose(expected, got) +@pytest.mark.no_numba def test_unary_transform_well_known_negate(): """Test unary transform with well-known NEGATE operation.""" dtype = np.int32 @@ -282,6 +283,7 @@ def test_unary_transform_well_known_negate(): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_unary_transform_well_known_identity(): """Test unary transform with well-known IDENTITY operation.""" dtype = np.int32 @@ -298,6 +300,7 @@ def test_unary_transform_well_known_identity(): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba @pytest.mark.parametrize("dtype", [np.int32, np.float16]) def test_binary_transform_well_known_plus(dtype): """Test binary transform with well-known PLUS operation.""" @@ -319,6 +322,7 @@ def test_binary_transform_well_known_plus(dtype): np.testing.assert_equal(d_output.get(), expected) +@pytest.mark.no_numba def test_binary_transform_well_known_multiplies(): """Test binary transform with well-known MULTIPLIES operation.""" dtype = np.int32 @@ -586,6 +590,7 @@ def test_binary_transform_with_lambda(): np.testing.assert_array_equal(d_out.get(), expected) +@pytest.mark.no_numba def test_binary_transform_bool_equal_to(): d_input1 = cp.array([True, False, True, False], dtype=np.bool_) d_input2 = cp.array([True, True, False, False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 83a4a17db67..5f7d310d71e 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -4,11 +4,11 @@ import cupy as cp -import numba.cuda import numpy as np import pytest 
import cuda.compute +from cuda.core import Device from cuda.compute import ( CacheModifiedInputIterator, DiscardIterator, @@ -125,7 +125,7 @@ def compare_op(lhs, rhs): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key(dtype, num_items, op, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) + d_out_num_selected = cp.asarray(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] - h_out_items = d_out_items.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] + h_out_items = d_out_items.get()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): - cc_major, _ = 
numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int64) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) + d_out_num_selected = cp.asarray(h_out_num_selected) i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] - h_out_items = d_out_items.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] + h_out_items = d_out_items.get()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -226,9 +226,9 @@ def test_unique_by_key_keys_only(): h_out_keys = np.empty(num_items, dtype=np.int32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_out_keys = cp.asarray(h_out_keys) + d_out_num_selected = cp.asarray(h_out_num_selected) unique_by_key_device( 
d_in_keys, @@ -240,9 +240,9 @@ def test_unique_by_key_keys_only(): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] expected_keys, _ = unique_by_key_host( h_in_keys, @@ -267,11 +267,11 @@ def compare_complex(lhs, rhs): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = cp.asarray(h_in_keys) + d_in_items = cp.asarray(h_in_items) + d_out_keys = cp.asarray(h_out_keys) + d_out_items = cp.asarray(h_out_items) + d_out_num_selected = cp.asarray(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -283,10 +283,10 @@ def compare_complex(lhs, rhs): num_items, ) - h_out_num_selected = d_out_num_selected.copy_to_host() + h_out_num_selected = d_out_num_selected.get() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.copy_to_host()[:num_selected] - h_out_items = d_out_items.copy_to_host()[:num_selected] + h_out_keys = d_out_keys.get()[:num_selected] + h_out_items = d_out_items.get()[:num_selected] expected_keys, expected_items = unique_by_key_host( h_in_keys, h_in_items, compare_complex @@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs): def test_unique_by_key_with_stream(cuda_stream, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. 
# TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -417,8 +417,9 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): np.testing.assert_array_equal(h_out_items, expected_items) +@pytest.mark.no_numba def test_unique_by_key_well_known_equal_to(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py index e08709b6e1a..e4d8185eb91 100644 --- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py +++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py @@ -12,7 +12,9 @@ """ import pytest -from numba import types + +numba = pytest.importorskip("numba") +types = numba.types from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper from cuda.compute._utils import sanitize_identifier diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index 85630bb4f5f..aea1f167f9f 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -6,6 +6,7 @@ import pytest import cuda.compute +from cuda.core import Device from cuda.compute import ( CountingIterator, TransformIterator, @@ -261,10 +262,8 @@ def min_pairs(p1, p2): @pytest.mark.parametrize("num_items", [10, 1000]) def test_output_zip_iterator_with_scan(monkeypatch, num_items): """Test ZipIterator as output iterator with scan operations.""" - import numba.cuda - # Skip SASS check for CC 8.0+ due to LDL/STL CI failure. 
- cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -426,9 +425,7 @@ def sum_nested_zips(v1, v2): ], ) def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map): - import numba.cuda - - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = Device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -502,6 +499,7 @@ def g(x): assert it1.kind != it2.kind +@pytest.mark.no_numba def test_caching_zip_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -577,6 +575,7 @@ def test_caching_zip_iterator(): assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind" +@pytest.mark.no_numba def test_compilation_caching_across_iterator_types(): """Test that compilation caching works across different iterator types.""" from cuda.compute import ConstantIterator From d97d809f8eaf9b8fc60fe9ebb337ef52284526c8 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 12:48:11 -0500 Subject: [PATCH 03/16] Stop wrapping binary search comparator in python callable --- .../cuda/compute/algorithms/_binary_search.py | 21 +------ .../tests/compute/test_binary_search.py | 60 +++++++++++++++++++ 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py index 6cadd994ceb..23a99c7bc40 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_binary_search.py @@ -16,19 +16,6 @@ from ..typing import DeviceArrayLike, IteratorT, Operator -def _normalize_comp(comp: Operator | None) -> OpAdapter: - # Use a lambda for the default comparator rather than OpKind.LESS - # 
because well-known ops don't carry type information needed by - # the binary search JIT compilation. - if comp is None or comp is OpKind.LESS: - - def _default_less(a, b): - return a < b - - return make_op_adapter(_default_less) - return make_op_adapter(comp) - - class _BinarySearch: __slots__ = [ "build_result", @@ -96,9 +83,7 @@ def __call__( set_cccl_iterator_state(self.d_out_cccl, d_out) # Update op state for stateful ops - comp_adapter = ( - _normalize_comp(comp) if comp is not None else _normalize_comp(None) - ) + comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp) self.op_cccl.state = comp_adapter.get_state() stream_handle = protocols.validate_and_get_stream(stream) @@ -154,7 +139,7 @@ def make_lower_bound( See Also: :func:`lower_bound` """ - comp_adapter = _normalize_comp(comp) + comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp) return _make_binary_search( d_data, d_values, @@ -193,7 +178,7 @@ def make_upper_bound( See Also: :func:`upper_bound` """ - comp_adapter = _normalize_comp(comp) + comp_adapter = make_op_adapter(OpKind.LESS if comp is None else comp) return _make_binary_search( d_data, d_values, diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index a4f7d047a9b..585a998c44c 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -6,6 +6,7 @@ import pytest import cuda.compute +from cuda.compute import OpKind DTYPE_LIST = [ np.int32, @@ -39,6 +40,65 @@ def disable_sass_check(monkeypatch): ) +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_explicit_opkind_less(search, side): + h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) + h_values = np.array([0, 3, 4, 10], dtype=np.int32) + + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = 
cp.empty(len(h_values), dtype=np.uintp) + + search( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_out, + comp=OpKind.LESS, + ) + + expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_custom_comparator(search, side): + h_data = np.array([9, 7, 3, 3, 1], dtype=np.int32) + h_values = np.array([10, 4, 3, 0], dtype=np.int32) + + def greater(lhs, rhs): + return lhs > rhs + + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + d_out = cp.empty(len(h_values), dtype=np.uintp) + + search( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_out, + comp=greater, + ) + + expected = np.searchsorted(-h_data, -h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + @pytest.mark.parametrize("dtype", DTYPE_LIST) @pytest.mark.parametrize( "num_items,num_values", [(0, 0), (0, 128), (128, 0), (512, 128)] From e0c72ab4a780c945ff2eaf849356bd4f0e121405 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 15:50:11 -0500 Subject: [PATCH 04/16] Add CI for 3.14t --- ci/matrix.yaml | 5 +- ci/test_cuda_compute_python.sh | 4 +- ci/windows/build_common_python.psm1 | 5 +- ci/windows/build_cuda_cccl_python.ps1 | 5 +- ci/windows/test_cuda_cccl_examples_python.ps1 | 54 ++++++++--------- ci/windows/test_cuda_cccl_headers_python.ps1 | 54 ++++++++--------- ci/windows/test_cuda_compute_python.ps1 | 58 +++++++++---------- ci/windows/test_cuda_coop_python.ps1 | 50 ++++++++-------- python/cuda_cccl/pyproject.toml | 1 + .../compute/test_free_threading_stress.py | 1 + 10 files changed, 121 insertions(+), 116 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 1fcb1df75ae..18b1a5e1515 100644 --- 
a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -82,9 +82,10 @@ workflows: - {jobs: ['test'], project: 'cccl_c_stf', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']} # Python -- pinned to gcc13 / msvc2022 for consistency across CTK images - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'} + - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14', '3.14t'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} + - {jobs: ['test'], project: 'python', py_version: ['3.14', '3.14t'], gpu: 'h100', cxx: 'gcc13'} - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'} + - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14t', gpu: 'l4', cxx: 'gcc13'} # CCCL packaging: - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'} - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'} diff --git a/ci/test_cuda_compute_python.sh b/ci/test_cuda_compute_python.sh index bd6ad432178..c829e72518a 100755 --- a/ci/test_cuda_compute_python.sh +++ b/ci/test_cuda_compute_python.sh @@ -35,5 +35,5 @@ if [[ "${CCCL_PYTHON_USE_V2:-}" =~ ^(1|true|TRUE|on|ON)$ ]]; then fi cd "/home/coder/cccl/python/cuda_cccl/tests/" -python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large" -python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large" +python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large and not free_threading" +python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large and not free_threading" diff --git a/ci/windows/build_common_python.psm1 
b/ci/windows/build_common_python.psm1 index dab9258761a..060da49d4bd 100644 --- a/ci/windows/build_common_python.psm1 +++ b/ci/windows/build_common_python.psm1 @@ -4,12 +4,13 @@ function Get-Python { Returns the path of the Python interpreter satisfying the supplied version, installing it via uv if necessary. .PARAMETER Version - A string in the form 'M.m' (e.g., '3.10', '3.13'). + A string in the form 'M.m' (e.g., '3.10', '3.13') or a free-threaded + version such as '3.14t'. #> [CmdletBinding()] param( [Parameter(Mandatory, Position = 0)] - [ValidatePattern('^\d+\.\d+$')] + [ValidatePattern('^\d+\.\d+t?$')] [string]$Version ) diff --git a/ci/windows/build_cuda_cccl_python.ps1 b/ci/windows/build_cuda_cccl_python.ps1 index 3d6dadd632a..3e825d38b9c 100644 --- a/ci/windows/build_cuda_cccl_python.ps1 +++ b/ci/windows/build_cuda_cccl_python.ps1 @@ -21,7 +21,8 @@ .PARAMETER PyVersion **Required.** The Python version to use for building the wheel, expressed - as `.` (e.g. `3.11`). + as `.` (e.g. `3.11`) or a free-threaded version such as + `3.14t`. .PARAMETER OnlyCudaMajor Optional. Restricts the build to a single CUDA major version (`12` or `13`). 
@@ -49,7 +50,7 @@ Param( [Parameter(Mandatory = $true)] [Alias("py-version")] - [ValidatePattern("^\d+\.\d+$")] + [ValidatePattern("^\d+\.\d+t?$")] [string]$PyVersion, [Parameter(Mandatory = $false)] diff --git a/ci/windows/test_cuda_cccl_examples_python.ps1 b/ci/windows/test_cuda_cccl_examples_python.ps1 index 0c108328822..8e50dc5d7bf 100644 --- a/ci/windows/test_cuda_cccl_examples_python.ps1 +++ b/ci/windows/test_cuda_cccl_examples_python.ps1 @@ -1,27 +1,27 @@ -Param( - [Parameter(Mandatory = $true)] - [Alias("py-version")] - [ValidatePattern("^\d+\.\d+$")] - [string]$PyVersion -) - -$ErrorActionPreference = "Stop" - -# Import shared helpers -Import-Module "$PSScriptRoot/build_common.psm1" -Import-Module "$PSScriptRoot/build_common_python.psm1" - -$python = Get-Python -Version $PyVersion -$cudaMajor = Get-CudaMajor - -$repoRoot = Get-RepoRoot - -${wheelPath} = Get-CudaCcclWheel -& $python -m pip install -U pip pytest pytest-xdist -& $python -m pip install "${wheelPath}[test-cu$cudaMajor]" - -Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") -try { - & $python -m pytest -n 6 test_examples.py -} -finally { Pop-Location } +Param( + [Parameter(Mandatory = $true)] + [Alias("py-version")] + [ValidatePattern("^\d+\.\d+t?$")] + [string]$PyVersion +) + +$ErrorActionPreference = "Stop" + +# Import shared helpers +Import-Module "$PSScriptRoot/build_common.psm1" +Import-Module "$PSScriptRoot/build_common_python.psm1" + +$python = Get-Python -Version $PyVersion +$cudaMajor = Get-CudaMajor + +$repoRoot = Get-RepoRoot + +${wheelPath} = Get-CudaCcclWheel +& $python -m pip install -U pip pytest pytest-xdist +& $python -m pip install "${wheelPath}[test-cu$cudaMajor]" + +Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") +try { + & $python -m pytest -n 6 test_examples.py +} +finally { Pop-Location } diff --git a/ci/windows/test_cuda_cccl_headers_python.ps1 b/ci/windows/test_cuda_cccl_headers_python.ps1 index 04a6adacc28..3a2fd40d51b 100644 --- 
a/ci/windows/test_cuda_cccl_headers_python.ps1 +++ b/ci/windows/test_cuda_cccl_headers_python.ps1 @@ -1,27 +1,27 @@ -Param( - [Parameter(Mandatory = $true)] - [Alias("py-version")] - [ValidatePattern("^\d+\.\d+$")] - [string]$PyVersion -) - -$ErrorActionPreference = "Stop" - -# Import shared helpers -Import-Module "$PSScriptRoot/build_common.psm1" -Import-Module "$PSScriptRoot/build_common_python.psm1" - -$python = Get-Python -Version $PyVersion -$cudaMajor = Get-CudaMajor - -$repoRoot = Get-RepoRoot - -${wheelPath} = Get-CudaCcclWheel -& $python -m pip install -U pip pytest pytest-xdist -& $python -m pip install "${wheelPath}[test-cu$cudaMajor]" - -Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") -try { - & $python -m pytest -n auto -v headers/ -} -finally { Pop-Location } +Param( + [Parameter(Mandatory = $true)] + [Alias("py-version")] + [ValidatePattern("^\d+\.\d+t?$")] + [string]$PyVersion +) + +$ErrorActionPreference = "Stop" + +# Import shared helpers +Import-Module "$PSScriptRoot/build_common.psm1" +Import-Module "$PSScriptRoot/build_common_python.psm1" + +$python = Get-Python -Version $PyVersion +$cudaMajor = Get-CudaMajor + +$repoRoot = Get-RepoRoot + +${wheelPath} = Get-CudaCcclWheel +& $python -m pip install -U pip pytest pytest-xdist +& $python -m pip install "${wheelPath}[test-cu$cudaMajor]" + +Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") +try { + & $python -m pytest -n auto -v headers/ +} +finally { Pop-Location } diff --git a/ci/windows/test_cuda_compute_python.ps1 b/ci/windows/test_cuda_compute_python.ps1 index 796d5128141..f8a9f2f7509 100644 --- a/ci/windows/test_cuda_compute_python.ps1 +++ b/ci/windows/test_cuda_compute_python.ps1 @@ -1,29 +1,29 @@ -Param( - [Parameter(Mandatory = $true)] - [Alias("py-version")] - [ValidatePattern("^\d+\.\d+$")] - [string]$PyVersion -) - -$ErrorActionPreference = "Stop" - -# Import shared helpers -Import-Module "$PSScriptRoot/build_common.psm1" -Import-Module 
"$PSScriptRoot/build_common_python.psm1" - -$python = Get-Python -Version $PyVersion -$cudaMajor = Get-CudaMajor - -$repoRoot = Get-RepoRoot - -$wheelPath = Get-CudaCcclWheel - -& $python -m pip install -U pip pytest pytest-xdist -& $python -m pip install "$wheelPath[test-cu$cudaMajor]" - -Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") -try { - & $python -m pytest -n 6 -v compute/ -m "not large" - & $python -m pytest -n 0 -v compute/ -m "large" -} -finally { Pop-Location } +Param( + [Parameter(Mandatory = $true)] + [Alias("py-version")] + [ValidatePattern("^\d+\.\d+t?$")] + [string]$PyVersion +) + +$ErrorActionPreference = "Stop" + +# Import shared helpers +Import-Module "$PSScriptRoot/build_common.psm1" +Import-Module "$PSScriptRoot/build_common_python.psm1" + +$python = Get-Python -Version $PyVersion +$cudaMajor = Get-CudaMajor + +$repoRoot = Get-RepoRoot + +$wheelPath = Get-CudaCcclWheel + +& $python -m pip install -U pip pytest pytest-xdist +& $python -m pip install "$wheelPath[test-cu$cudaMajor]" + +Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") +try { + & $python -m pytest -n 6 -v compute/ -m "not large and not free_threading" + & $python -m pytest -n 0 -v compute/ -m "large and not free_threading" +} +finally { Pop-Location } diff --git a/ci/windows/test_cuda_coop_python.ps1 b/ci/windows/test_cuda_coop_python.ps1 index 7fb5f9628bc..b0168ffcb32 100644 --- a/ci/windows/test_cuda_coop_python.ps1 +++ b/ci/windows/test_cuda_coop_python.ps1 @@ -1,25 +1,25 @@ -Param( - [Parameter(Mandatory = $true)] - [Alias("py-version")] - [ValidatePattern("^\d+\.\d+$")] - [string]$PyVersion -) - -$ErrorActionPreference = "Stop" - -# Import shared helpers -Import-Module "$PSScriptRoot/build_common.psm1" -Import-Module "$PSScriptRoot/build_common_python.psm1" - -$python = Get-Python -Version $PyVersion -$cudaMajor = Get-CudaMajor - -${wheelPath} = Get-CudaCcclWheel -& $python -m pip install -U pip pytest pytest-xdist -& $python -m pip install 
"${wheelPath}[test-cu$cudaMajor]" - -Push-Location (Join-Path (Get-RepoRoot) "python/cuda_cccl/tests") -try { - & $python -m pytest -n auto -v coop/_experimental/ -} -finally { Pop-Location } +Param( + [Parameter(Mandatory = $true)] + [Alias("py-version")] + [ValidatePattern("^\d+\.\d+t?$")] + [string]$PyVersion +) + +$ErrorActionPreference = "Stop" + +# Import shared helpers +Import-Module "$PSScriptRoot/build_common.psm1" +Import-Module "$PSScriptRoot/build_common_python.psm1" + +$python = Get-Python -Version $PyVersion +$cudaMajor = Get-CudaMajor + +${wheelPath} = Get-CudaCcclWheel +& $python -m pip install -U pip pytest pytest-xdist +& $python -m pip install "${wheelPath}[test-cu$cudaMajor]" + +Push-Location (Join-Path (Get-RepoRoot) "python/cuda_cccl/tests") +try { + & $python -m pytest -n auto -v coop/_experimental/ +} +finally { Pop-Location } diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 967bc86d58b..e90b58382c9 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -157,4 +157,5 @@ markers = [ "no_verify_sass: skip SASS verification check", "large: tests requiring large device memory allocations", "no_numba: tests that should not import numba or numba.cuda", + "free_threading: tests requiring free-threaded CPython with the GIL disabled", ] diff --git a/python/cuda_cccl/tests/compute/test_free_threading_stress.py b/python/cuda_cccl/tests/compute/test_free_threading_stress.py index fa2e389a627..1e40996fbe5 100644 --- a/python/cuda_cccl/tests/compute/test_free_threading_stress.py +++ b/python/cuda_cccl/tests/compute/test_free_threading_stress.py @@ -17,6 +17,7 @@ pytestmark = [ + pytest.mark.free_threading, pytest.mark.no_numba, pytest.mark.no_verify_sass( reason="Free-threading stress tests intentionally run concurrent workers." 
From dda2d2aa5147e04b6d12be20fda5bf7074d43810 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 15:51:27 -0500 Subject: [PATCH 05/16] Add benchmarks to measure host side overhead --- .../benchmarks/compute/host/build_time.py | 63 + .../benchmarks/compute/host/common.py | 1263 +++++++++++++++++ .../compute/host/compare_results.py | 258 ++++ .../benchmarks/compute/host/oneshot_cached.py | 75 + .../benchmarks/compute/host/twoshot_call.py | 77 + 5 files changed, 1736 insertions(+) create mode 100644 python/cuda_cccl/benchmarks/compute/host/build_time.py create mode 100644 python/cuda_cccl/benchmarks/compute/host/common.py create mode 100644 python/cuda_cccl/benchmarks/compute/host/compare_results.py create mode 100644 python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py create mode 100644 python/cuda_cccl/benchmarks/compute/host/twoshot_call.py diff --git a/python/cuda_cccl/benchmarks/compute/host/build_time.py b/python/cuda_cccl/benchmarks/compute/host/build_time.py new file mode 100644 index 00000000000..b445f57d879 --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/host/build_time.py @@ -0,0 +1,63 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse +import time + +import cuda.compute as cc + +from common import ( + TimingResult, + add_case_filter, + add_json_output, + print_results, + select_cases, + synchronize, + write_results_json, +) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Measure cold cuda.compute make_* build time." + ) + parser.add_argument( + "--repeat", + type=int, + default=10, + help="Number of cold build samples. 
Defaults to 10.", + ) + add_case_filter(parser) + add_json_output(parser) + args = parser.parse_args() + + results = [] + for case in select_cases(args.case): + state = case.setup() + synchronize() + + samples_ns = [] + for _ in range(args.repeat): + cc.clear_all_caches() + start = time.perf_counter_ns() + case.make_wrapper(state) + end = time.perf_counter_ns() + samples_ns.append(end - start) + + results.append(TimingResult(case.name, samples_ns=samples_ns, number=1)) + + print_results(results) + if args.json is not None: + write_results_json( + args.json, + benchmark="build_time", + results=results, + config={"repeat": args.repeat, "case": args.case}, + ) + + +if __name__ == "__main__": + main() diff --git a/python/cuda_cccl/benchmarks/compute/host/common.py b/python/cuda_cccl/benchmarks/compute/host/common.py new file mode 100644 index 00000000000..a6d7d4dea1b --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/host/common.py @@ -0,0 +1,1263 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse +import json +import platform +import statistics +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from types import SimpleNamespace +from typing import Any, Callable, Iterable, Literal + +import cupy as cp +import numpy as np + +import cuda.compute as cc +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp + +NOOP_TEMP_STORAGE_BYTES = 1 +NUM_ITEMS = 128 +NUM_SEGMENTS = 4 +MIN_SAMPLES_FOR_NOISE_ESTIMATE = 5 + +NoopReturnKind = Literal["none", "temp_storage_bytes", "temp_storage_and_selector"] + + +@dataclass(frozen=True) +class TimingResult: + name: str + samples_ns: list[float] + number: int + + @property + def min_ns(self) -> float: + return min(self.samples_ns) + + @property + def median_ns(self) -> float: + return statistics.median(self.samples_ns) + + @property + def mean_ns(self) -> float: + return statistics.mean(self.samples_ns) + + @property + def stdev_ns(self) -> float | None: + if len(self.samples_ns) < MIN_SAMPLES_FOR_NOISE_ESTIMATE: + return None + return statistics.stdev(self.samples_ns) + + @property + def relative_noise(self) -> float | None: + stdev_ns = self.stdev_ns + mean_ns = self.mean_ns + if stdev_ns is None or mean_ns <= 0: + return None + return stdev_ns / mean_ns + + def as_json(self) -> dict[str, Any]: + return { + "name": self.name, + "unit": "ns", + "number": self.number, + "samples": self.samples_ns, + "min": self.min_ns, + "median": self.median_ns, + "mean": self.mean_ns, + "stdev": self.stdev_ns, + "relative_noise": self.relative_noise, + } + + +@dataclass(frozen=True) +class HostBenchmarkCase: + name: str + setup: Callable[[], SimpleNamespace] + make_wrapper: Callable[[SimpleNamespace], Any] + oneshot: Callable[[SimpleNamespace], None] + twoshot: Callable[[SimpleNamespace, Any], None] + 
noop_return_kind: NoopReturnKind + skip_reason: str | None = None + + +class NoopBuildResult: + """Proxy that skips native compute while preserving wrapper host work.""" + + def __init__(self, real_build_result: Any, return_kind: NoopReturnKind): + self._real_build_result = real_build_result + self._return_kind = return_kind + + def __getattr__(self, name: str) -> Any: + return getattr(self._real_build_result, name) + + def compute(self, *args, **kwargs): + return _noop_return(self._return_kind) + + def compute_even(self, *args, **kwargs): + return _noop_return(self._return_kind) + + +def _noop_return(return_kind: NoopReturnKind): + if return_kind == "none": + return None + if return_kind == "temp_storage_bytes": + return NOOP_TEMP_STORAGE_BYTES + if return_kind == "temp_storage_and_selector": + return NOOP_TEMP_STORAGE_BYTES, -1 + raise ValueError(f"Unsupported no-op return kind: {return_kind}") + + +def patch_wrapper_to_skip_native_compute( + wrapper: Any, return_kind: NoopReturnKind +) -> None: + """Patch a cached wrapper so measured calls skip native compute.""" + if hasattr(wrapper, "build_result"): + wrapper.build_result = NoopBuildResult(wrapper.build_result, return_kind) + + if hasattr(wrapper, "device_reduce_fn"): + wrapper.device_reduce_fn = lambda *args, **kwargs: _noop_return(return_kind) + + if hasattr(wrapper, "device_scan_fn"): + wrapper.device_scan_fn = lambda *args, **kwargs: _noop_return(return_kind) + + if hasattr(wrapper, "partitioner"): + patch_wrapper_to_skip_native_compute(wrapper.partitioner, return_kind) + + +def make_tiny_temp_storage() -> cp.ndarray: + return cp.empty(NOOP_TEMP_STORAGE_BYTES, dtype=cp.uint8) + + +def synchronize() -> None: + cp.cuda.Device().synchronize() + + +def measure_call( + name: str, + fn: Callable[[], None], + *, + repeat: int, + number: int, +) -> TimingResult: + samples_ns = [] + for _ in range(repeat): + start = time.perf_counter_ns() + for _ in range(number): + fn() + end = time.perf_counter_ns() + 
samples_ns.append((end - start) / number) + return TimingResult(name=name, samples_ns=samples_ns, number=number) + + +def print_results(results: Iterable[TimingResult]) -> None: + rows = list(results) + name_width = max((len(row.name) for row in rows), default=4) + print( + f"{'case':<{name_width}} {'median':>12} {'min':>12} " + f"{'mean':>12} {'noise':>8} {'repeat':>6} {'number':>6}" + ) + print("-" * (name_width + 68)) + for result in rows: + print( + f"{result.name:<{name_width}} " + f"{_format_ns(result.median_ns):>12} " + f"{_format_ns(result.min_ns):>12} " + f"{_format_ns(result.mean_ns):>12} " + f"{_format_percentage(result.relative_noise):>8} " + f"{len(result.samples_ns):>6} " + f"{result.number:>6}" + ) + + +def _format_ns(ns: float) -> str: + if ns < 1_000: + return f"{ns:.1f} ns" + if ns < 1_000_000: + return f"{ns / 1_000:.2f} us" + return f"{ns / 1_000_000:.2f} ms" + + +def _format_percentage(value: float | None) -> str: + if value is None: + return "inf" + return f"{value * 100.0:.2f}%" + + +def add_case_filter(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--case", + action="append", + choices=[case.name for case in CASES], + help="Benchmark case to run. 
May be passed multiple times.", + ) + + +def add_json_output(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--json", + type=Path, + help="Write structured benchmark results to this JSON file.", + ) + + +def write_results_json( + path: Path, + *, + benchmark: str, + results: Iterable[TimingResult], + config: dict[str, Any], +) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "schema": "cuda.compute.host_benchmark.v1", + "benchmark": benchmark, + "created_at": datetime.now(timezone.utc).isoformat(), + "config": config, + "environment": _environment_info(), + "results": [result.as_json() for result in results], + } + path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") + + +def _environment_info() -> dict[str, Any]: + device_count = cp.cuda.runtime.getDeviceCount() + devices = [] + for device_id in range(device_count): + props = cp.cuda.runtime.getDeviceProperties(device_id) + name = props["name"] + if isinstance(name, bytes): + name = name.decode() + devices.append( + { + "id": device_id, + "name": name, + "compute_capability": [ + int(props["major"]), + int(props["minor"]), + ], + } + ) + + return { + "python": sys.version, + "platform": platform.platform(), + "devices": devices, + } + + +def select_cases(case_names: list[str] | None) -> list[HostBenchmarkCase]: + if not case_names: + selected_cases = CASES + else: + selected = set(case_names) + selected_cases = [case for case in CASES if case.name in selected] + + runnable = [] + skipped_by_reason: dict[str, list[str]] = {} + for case in selected_cases: + if case.skip_reason is None: + runnable.append(case) + else: + skipped_by_reason.setdefault(case.skip_reason, []).append(case.name) + + for reason, names in skipped_by_reason.items(): + print(f"Skipping {len(names)} benchmark case(s): {', '.join(names)}") + print(f" Reason: {reason}") + + return runnable + + +def _numba_cuda_skip_reason() -> str | None: + try: + import numba.cuda # noqa: 
F401 + except Exception as exc: + return f"numba.cuda is not available: {exc}" + return None + + +_NUMBA_CUDA_SKIP_REASON = _numba_cuda_skip_reason() + + +def _raw_predicate_i32(name: str) -> RawOp: + source = f""" +extern "C" __device__ void {name}(void* x, void* result) {{ + int value = *static_cast(x); + *static_cast(result) = value < {NUM_ITEMS // 2}; +}} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name=name) + + +def _raw_plus_i32() -> RawOp: + source = """ +extern "C" __device__ void host_bench_plus_i32( + void* lhs, + void* rhs, + void* result +) { + *static_cast(result) = + *static_cast(lhs) + *static_cast(rhs); +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_plus_i32") + + +def _raw_identity_i32() -> RawOp: + source = """ +extern "C" __device__ void host_bench_identity_i32(void* x, void* result) { + *static_cast(result) = *static_cast(x); +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_identity_i32") + + +def _raw_less_i32() -> RawOp: + source = """ +extern "C" __device__ void host_bench_less_i32( + void* lhs, + void* rhs, + void* result +) { + *static_cast(result) = + *static_cast(lhs) < *static_cast(rhs); +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_less_i32") + + +def _raw_equal_i32() -> RawOp: + source = """ +extern "C" __device__ void host_bench_equal_i32( + void* lhs, + void* rhs, + void* result +) { + *static_cast(result) = + *static_cast(lhs) == *static_cast(rhs); +} +""" + return RawOp(ltoir=compile_cpp_op_code(source), name="host_bench_equal_i32") + + +def _py_plus_i32(lhs, rhs): + return lhs + rhs + + +def _py_identity_i32(x): + return x + + +def _py_less_i32(lhs, rhs): + return lhs < rhs + + +def _py_equal_i32(lhs, rhs): + return lhs == rhs + + +def _py_predicate_i32(x): + return x < NUM_ITEMS // 2 + + +def _setup_unary_input_output() -> SimpleNamespace: + d_in = cp.arange(NUM_ITEMS, dtype=cp.int32) + d_out = cp.empty_like(d_in) + return 
SimpleNamespace(d_in=d_in, d_out=d_out, num_items=NUM_ITEMS) + + +def _setup_binary_input_output() -> SimpleNamespace: + d_in1 = cp.arange(NUM_ITEMS, dtype=cp.int32) + d_in2 = cp.arange(NUM_ITEMS, dtype=cp.int32) + d_out = cp.empty_like(d_in1) + return SimpleNamespace(d_in1=d_in1, d_in2=d_in2, d_out=d_out, num_items=NUM_ITEMS) + + +def _setup_reduce() -> SimpleNamespace: + state = _setup_unary_input_output() + state.h_init = np.array([0], dtype=np.int32) + state.op = cc.OpKind.PLUS + state.temp_storage = make_tiny_temp_storage() + return state + + +def _make_reduce(state: SimpleNamespace): + return cc.make_reduce_into( + d_in=state.d_in, + d_out=state.d_out[:1], + op=state.op, + h_init=state.h_init, + ) + + +def _oneshot_reduce(state: SimpleNamespace) -> None: + cc.reduce_into( + d_in=state.d_in, + d_out=state.d_out[:1], + num_items=state.num_items, + op=state.op, + h_init=state.h_init, + ) + + +def _twoshot_reduce(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out[:1], + num_items=state.num_items, + op=state.op, + h_init=state.h_init, + ) + + +def _setup_scan() -> SimpleNamespace: + state = _setup_unary_input_output() + state.h_init = np.array([0], dtype=np.int32) + state.op = cc.OpKind.PLUS + state.temp_storage = make_tiny_temp_storage() + return state + + +def _make_scan(state: SimpleNamespace): + return cc.make_exclusive_scan( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + init_value=state.h_init, + ) + + +def _oneshot_scan(state: SimpleNamespace) -> None: + cc.exclusive_scan( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + init_value=state.h_init, + num_items=state.num_items, + ) + + +def _twoshot_scan(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + init_value=state.h_init, + num_items=state.num_items, + ) + + +def _setup_segmented_reduce() -> SimpleNamespace: + d_in = 
cp.arange(NUM_ITEMS, dtype=cp.int32) + d_out = cp.empty(NUM_SEGMENTS, dtype=cp.int32) + offsets = cp.asarray( + np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64) + ) + return SimpleNamespace( + d_in=d_in, + d_out=d_out, + start_offsets=offsets[:-1], + end_offsets=offsets[1:], + num_segments=NUM_SEGMENTS, + h_init=np.array([0], dtype=np.int32), + op=cc.OpKind.PLUS, + temp_storage=make_tiny_temp_storage(), + ) + + +def _make_segmented_reduce(state: SimpleNamespace): + return cc.make_segmented_reduce( + d_in=state.d_in, + d_out=state.d_out, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + op=state.op, + h_init=state.h_init, + ) + + +def _oneshot_segmented_reduce(state: SimpleNamespace) -> None: + cc.segmented_reduce( + d_in=state.d_in, + d_out=state.d_out, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + op=state.op, + h_init=state.h_init, + ) + + +def _twoshot_segmented_reduce(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + op=state.op, + h_init=state.h_init, + ) + + +def _make_unary_transform(state: SimpleNamespace): + return cc.make_unary_transform( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + ) + + +def _oneshot_unary_transform(state: SimpleNamespace) -> None: + cc.unary_transform( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + ) + + +def _twoshot_unary_transform(state: SimpleNamespace, wrapper) -> None: + wrapper( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + ) + + +def _make_binary_transform(state: SimpleNamespace): + return cc.make_binary_transform( + d_in1=state.d_in1, + d_in2=state.d_in2, + d_out=state.d_out, + op=state.op, + ) + + +def _oneshot_binary_transform(state: 
SimpleNamespace) -> None: + cc.binary_transform( + d_in1=state.d_in1, + d_in2=state.d_in2, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + ) + + +def _twoshot_binary_transform(state: SimpleNamespace, wrapper) -> None: + wrapper( + d_in1=state.d_in1, + d_in2=state.d_in2, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + ) + + +def _setup_histogram() -> SimpleNamespace: + d_samples = cp.arange(NUM_ITEMS, dtype=cp.int32) + num_output_levels = 17 + d_histogram = cp.empty(num_output_levels - 1, dtype=cp.int32) + lower_level = np.int32(0) + upper_level = np.int32(NUM_ITEMS) + return SimpleNamespace( + d_samples=d_samples, + d_histogram=d_histogram, + num_output_levels=num_output_levels, + h_num_output_levels=np.array([num_output_levels], dtype=np.int32), + lower_level=lower_level, + upper_level=upper_level, + h_lower_level=np.array([lower_level], dtype=np.int32), + h_upper_level=np.array([upper_level], dtype=np.int32), + num_samples=NUM_ITEMS, + temp_storage=make_tiny_temp_storage(), + ) + + +def _make_histogram(state: SimpleNamespace): + return cc.make_histogram_even( + d_samples=state.d_samples, + d_histogram=state.d_histogram, + h_num_output_levels=state.h_num_output_levels, + h_lower_level=state.h_lower_level, + h_upper_level=state.h_upper_level, + num_samples=state.num_samples, + ) + + +def _oneshot_histogram(state: SimpleNamespace) -> None: + cc.histogram_even( + d_samples=state.d_samples, + d_histogram=state.d_histogram, + num_output_levels=state.num_output_levels, + lower_level=state.lower_level, + upper_level=state.upper_level, + num_samples=state.num_samples, + ) + + +def _twoshot_histogram(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_samples=state.d_samples, + d_histogram=state.d_histogram, + h_num_output_levels=state.h_num_output_levels, + h_lower_level=state.h_lower_level, + h_upper_level=state.h_upper_level, + num_samples=state.num_samples, + ) + + +def _setup_binary_search() 
-> SimpleNamespace: + d_data = cp.arange(NUM_ITEMS, dtype=cp.int32) + d_values = cp.arange(0, NUM_ITEMS, 2, dtype=cp.int32) + d_out = cp.empty(d_values.size, dtype=np.uintp) + return SimpleNamespace( + d_data=d_data, + d_values=d_values, + d_out=d_out, + num_items=NUM_ITEMS, + num_values=int(d_values.size), + comp=cc.OpKind.LESS, + ) + + +def _make_lower_bound(state: SimpleNamespace): + return cc.make_lower_bound( + d_data=state.d_data, + d_values=state.d_values, + d_out=state.d_out, + comp=state.comp, + ) + + +def _oneshot_lower_bound(state: SimpleNamespace) -> None: + cc.lower_bound( + d_data=state.d_data, + num_items=state.num_items, + d_values=state.d_values, + num_values=state.num_values, + d_out=state.d_out, + comp=state.comp, + ) + + +def _twoshot_lower_bound(state: SimpleNamespace, wrapper) -> None: + wrapper( + d_data=state.d_data, + num_items=state.num_items, + d_values=state.d_values, + num_values=state.num_values, + d_out=state.d_out, + comp=state.comp, + ) + + +def _setup_select() -> SimpleNamespace: + state = _setup_unary_input_output() + state.d_num_selected = cp.empty(1, dtype=np.uint64) + state.cond = cc.OpKind.LOGICAL_NOT + state.temp_storage = make_tiny_temp_storage() + return state + + +def _make_select(state: SimpleNamespace): + return cc.make_select( + d_in=state.d_in, + d_out=state.d_out, + d_num_selected_out=state.d_num_selected, + cond=state.cond, + ) + + +def _oneshot_select(state: SimpleNamespace) -> None: + cc.select( + d_in=state.d_in, + d_out=state.d_out, + d_num_selected_out=state.d_num_selected, + cond=state.cond, + num_items=state.num_items, + ) + + +def _twoshot_select(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out, + d_num_selected_out=state.d_num_selected, + cond=state.cond, + num_items=state.num_items, + ) + + +def _setup_three_way_partition() -> SimpleNamespace: + state = _setup_unary_input_output() + state.d_first = cp.empty_like(state.d_in) + 
state.d_second = cp.empty_like(state.d_in) + state.d_unselected = cp.empty_like(state.d_in) + state.d_num_selected = cp.empty(2, dtype=np.uint64) + state.first_op = cc.OpKind.LOGICAL_NOT + state.second_op = cc.OpKind.LOGICAL_NOT + state.temp_storage = make_tiny_temp_storage() + return state + + +def _make_three_way_partition(state: SimpleNamespace): + return cc.make_three_way_partition( + d_in=state.d_in, + d_first_part_out=state.d_first, + d_second_part_out=state.d_second, + d_unselected_out=state.d_unselected, + d_num_selected_out=state.d_num_selected, + select_first_part_op=state.first_op, + select_second_part_op=state.second_op, + ) + + +def _oneshot_three_way_partition(state: SimpleNamespace) -> None: + cc.three_way_partition( + d_in=state.d_in, + d_first_part_out=state.d_first, + d_second_part_out=state.d_second, + d_unselected_out=state.d_unselected, + d_num_selected_out=state.d_num_selected, + select_first_part_op=state.first_op, + select_second_part_op=state.second_op, + num_items=state.num_items, + ) + + +def _twoshot_three_way_partition(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_first_part_out=state.d_first, + d_second_part_out=state.d_second, + d_unselected_out=state.d_unselected, + d_num_selected_out=state.d_num_selected, + select_first_part_op=state.first_op, + select_second_part_op=state.second_op, + num_items=state.num_items, + ) + + +def _setup_unique_by_key() -> SimpleNamespace: + d_keys = cp.arange(NUM_ITEMS, dtype=cp.int32) + d_items = cp.arange(NUM_ITEMS, dtype=cp.int32) + return SimpleNamespace( + d_in_keys=d_keys, + d_in_items=d_items, + d_out_keys=cp.empty_like(d_keys), + d_out_items=cp.empty_like(d_items), + d_num_selected=cp.empty(1, dtype=np.uint64), + op=cc.OpKind.EQUAL_TO, + num_items=NUM_ITEMS, + temp_storage=make_tiny_temp_storage(), + ) + + +def _make_unique_by_key(state: SimpleNamespace): + return cc.make_unique_by_key( + d_in_keys=state.d_in_keys, + 
d_in_items=state.d_in_items, + d_out_keys=state.d_out_keys, + d_out_items=state.d_out_items, + d_out_num_selected=state.d_num_selected, + op=state.op, + ) + + +def _oneshot_unique_by_key(state: SimpleNamespace) -> None: + cc.unique_by_key( + d_in_keys=state.d_in_keys, + d_in_items=state.d_in_items, + d_out_keys=state.d_out_keys, + d_out_items=state.d_out_items, + d_out_num_selected=state.d_num_selected, + op=state.op, + num_items=state.num_items, + ) + + +def _twoshot_unique_by_key(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_in_items=state.d_in_items, + d_out_keys=state.d_out_keys, + d_out_items=state.d_out_items, + d_out_num_selected=state.d_num_selected, + op=state.op, + num_items=state.num_items, + ) + + +def _setup_sort() -> SimpleNamespace: + d_in_keys = cp.arange(NUM_ITEMS, 0, -1, dtype=cp.int32) + d_out_keys = cp.empty_like(d_in_keys) + return SimpleNamespace( + d_in_keys=d_in_keys, + d_out_keys=d_out_keys, + op=cc.OpKind.LESS, + num_items=NUM_ITEMS, + temp_storage=make_tiny_temp_storage(), + ) + + +def _make_merge_sort(state: SimpleNamespace): + return cc.make_merge_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + op=state.op, + ) + + +def _oneshot_merge_sort(state: SimpleNamespace) -> None: + cc.merge_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + num_items=state.num_items, + op=state.op, + ) + + +def _twoshot_merge_sort(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_in_values=None, + d_out_keys=state.d_out_keys, + d_out_values=None, + num_items=state.num_items, + op=state.op, + ) + + +def _make_radix_sort(state: SimpleNamespace): + return cc.make_radix_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + order=cc.SortOrder.ASCENDING, + ) + + +def _oneshot_radix_sort(state: SimpleNamespace) -> None: + 
cc.radix_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + num_items=state.num_items, + order=cc.SortOrder.ASCENDING, + ) + + +def _twoshot_radix_sort(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + num_items=state.num_items, + ) + + +def _setup_segmented_sort() -> SimpleNamespace: + state = _setup_sort() + offsets = cp.asarray( + np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64) + ) + state.start_offsets = offsets[:-1] + state.end_offsets = offsets[1:] + state.num_segments = NUM_SEGMENTS + return state + + +def _make_segmented_sort(state: SimpleNamespace): + return cc.make_segmented_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + order=cc.SortOrder.ASCENDING, + ) + + +def _oneshot_segmented_sort(state: SimpleNamespace) -> None: + cc.segmented_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + num_items=state.num_items, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + order=cc.SortOrder.ASCENDING, + ) + + +def _twoshot_segmented_sort(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + num_items=state.num_items, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + ) + + +def _setup_with_values( + setup_fn: Callable[[], SimpleNamespace], **values: Any +) -> Callable[[], SimpleNamespace]: + def setup() -> SimpleNamespace: + state = setup_fn() + for name, value in values.items(): + setattr(state, name, value) + return state + + return setup + + +def 
_setup_with_factories( + setup_fn: Callable[[], SimpleNamespace], **factories: Callable[[], Any] +) -> Callable[[], SimpleNamespace]: + def setup() -> SimpleNamespace: + state = setup_fn() + for name, factory in factories.items(): + setattr(state, name, factory()) + return state + + return setup + + +def _make_case( + name: str, + setup: Callable[[], SimpleNamespace], + make_wrapper: Callable[[SimpleNamespace], Any], + oneshot: Callable[[SimpleNamespace], None], + twoshot: Callable[[SimpleNamespace, Any], None], + noop_return_kind: NoopReturnKind, + skip_reason: str | None = None, +) -> HostBenchmarkCase: + return HostBenchmarkCase( + name, + setup, + make_wrapper, + oneshot, + twoshot, + noop_return_kind, + skip_reason, + ) + + +CASES = [ + _make_case( + "reduce.plus", + _setup_with_values(_setup_reduce, op=cc.OpKind.PLUS), + _make_reduce, + _oneshot_reduce, + _twoshot_reduce, + "temp_storage_bytes", + ), + _make_case( + "reduce.raw_cpp", + _setup_with_factories(_setup_reduce, op=_raw_plus_i32), + _make_reduce, + _oneshot_reduce, + _twoshot_reduce, + "temp_storage_bytes", + ), + _make_case( + "reduce.python", + _setup_with_values(_setup_reduce, op=_py_plus_i32), + _make_reduce, + _oneshot_reduce, + _twoshot_reduce, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "exclusive_scan.plus", + _setup_with_values(_setup_scan, op=cc.OpKind.PLUS), + _make_scan, + _oneshot_scan, + _twoshot_scan, + "temp_storage_bytes", + ), + _make_case( + "exclusive_scan.raw_cpp", + _setup_with_factories(_setup_scan, op=_raw_plus_i32), + _make_scan, + _oneshot_scan, + _twoshot_scan, + "temp_storage_bytes", + ), + _make_case( + "exclusive_scan.python", + _setup_with_values(_setup_scan, op=_py_plus_i32), + _make_scan, + _oneshot_scan, + _twoshot_scan, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "segmented_reduce.plus", + _setup_with_values(_setup_segmented_reduce, op=cc.OpKind.PLUS), + _make_segmented_reduce, + _oneshot_segmented_reduce, + 
_twoshot_segmented_reduce, + "temp_storage_bytes", + ), + _make_case( + "segmented_reduce.raw_cpp", + _setup_with_factories(_setup_segmented_reduce, op=_raw_plus_i32), + _make_segmented_reduce, + _oneshot_segmented_reduce, + _twoshot_segmented_reduce, + "temp_storage_bytes", + ), + _make_case( + "segmented_reduce.python", + _setup_with_values(_setup_segmented_reduce, op=_py_plus_i32), + _make_segmented_reduce, + _oneshot_segmented_reduce, + _twoshot_segmented_reduce, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "unary_transform.identity", + _setup_with_values(_setup_unary_input_output, op=cc.OpKind.IDENTITY), + _make_unary_transform, + _oneshot_unary_transform, + _twoshot_unary_transform, + "none", + ), + _make_case( + "unary_transform.raw_cpp", + _setup_with_factories(_setup_unary_input_output, op=_raw_identity_i32), + _make_unary_transform, + _oneshot_unary_transform, + _twoshot_unary_transform, + "none", + ), + _make_case( + "unary_transform.python", + _setup_with_values(_setup_unary_input_output, op=_py_identity_i32), + _make_unary_transform, + _oneshot_unary_transform, + _twoshot_unary_transform, + "none", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "binary_transform.plus", + _setup_with_values(_setup_binary_input_output, op=cc.OpKind.PLUS), + _make_binary_transform, + _oneshot_binary_transform, + _twoshot_binary_transform, + "none", + ), + _make_case( + "binary_transform.raw_cpp", + _setup_with_factories(_setup_binary_input_output, op=_raw_plus_i32), + _make_binary_transform, + _oneshot_binary_transform, + _twoshot_binary_transform, + "none", + ), + _make_case( + "binary_transform.python", + _setup_with_values(_setup_binary_input_output, op=_py_plus_i32), + _make_binary_transform, + _oneshot_binary_transform, + _twoshot_binary_transform, + "none", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "histogram_even", + _setup_histogram, + _make_histogram, + _oneshot_histogram, + _twoshot_histogram, + "temp_storage_bytes", + ), + 
_make_case( + "lower_bound.less", + _setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS), + _make_lower_bound, + _oneshot_lower_bound, + _twoshot_lower_bound, + "none", + ), + _make_case( + "lower_bound.raw_cpp", + _setup_with_factories(_setup_binary_search, comp=_raw_less_i32), + _make_lower_bound, + _oneshot_lower_bound, + _twoshot_lower_bound, + "none", + ), + _make_case( + "lower_bound.python", + _setup_with_values(_setup_binary_search, comp=_py_less_i32), + _make_lower_bound, + _oneshot_lower_bound, + _twoshot_lower_bound, + "none", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "select.logical_not", + _setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT), + _make_select, + _oneshot_select, + _twoshot_select, + "temp_storage_bytes", + ), + _make_case( + "select.raw_cpp", + _setup_with_factories( + _setup_select, + cond=lambda: _raw_predicate_i32("host_bench_select_predicate_i32"), + ), + _make_select, + _oneshot_select, + _twoshot_select, + "temp_storage_bytes", + ), + _make_case( + "select.python", + _setup_with_values(_setup_select, cond=_py_predicate_i32), + _make_select, + _oneshot_select, + _twoshot_select, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "three_way_partition.logical_not", + _setup_with_values( + _setup_three_way_partition, + first_op=cc.OpKind.LOGICAL_NOT, + second_op=cc.OpKind.LOGICAL_NOT, + ), + _make_three_way_partition, + _oneshot_three_way_partition, + _twoshot_three_way_partition, + "temp_storage_bytes", + ), + _make_case( + "three_way_partition.raw_cpp", + _setup_with_factories( + _setup_three_way_partition, + first_op=lambda: _raw_predicate_i32("host_bench_partition_first_i32"), + second_op=lambda: _raw_predicate_i32("host_bench_partition_second_i32"), + ), + _make_three_way_partition, + _oneshot_three_way_partition, + _twoshot_three_way_partition, + "temp_storage_bytes", + ), + _make_case( + "three_way_partition.python", + _setup_with_values( + _setup_three_way_partition, + 
first_op=_py_predicate_i32, + second_op=_py_predicate_i32, + ), + _make_three_way_partition, + _oneshot_three_way_partition, + _twoshot_three_way_partition, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "unique_by_key.equal", + _setup_with_values(_setup_unique_by_key, op=cc.OpKind.EQUAL_TO), + _make_unique_by_key, + _oneshot_unique_by_key, + _twoshot_unique_by_key, + "temp_storage_bytes", + ), + _make_case( + "unique_by_key.raw_cpp", + _setup_with_factories(_setup_unique_by_key, op=_raw_equal_i32), + _make_unique_by_key, + _oneshot_unique_by_key, + _twoshot_unique_by_key, + "temp_storage_bytes", + ), + _make_case( + "unique_by_key.python", + _setup_with_values(_setup_unique_by_key, op=_py_equal_i32), + _make_unique_by_key, + _oneshot_unique_by_key, + _twoshot_unique_by_key, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "merge_sort.less", + _setup_with_values(_setup_sort, op=cc.OpKind.LESS), + _make_merge_sort, + _oneshot_merge_sort, + _twoshot_merge_sort, + "temp_storage_bytes", + ), + _make_case( + "merge_sort.raw_cpp", + _setup_with_factories(_setup_sort, op=_raw_less_i32), + _make_merge_sort, + _oneshot_merge_sort, + _twoshot_merge_sort, + "temp_storage_bytes", + ), + _make_case( + "merge_sort.python", + _setup_with_values(_setup_sort, op=_py_less_i32), + _make_merge_sort, + _oneshot_merge_sort, + _twoshot_merge_sort, + "temp_storage_bytes", + _NUMBA_CUDA_SKIP_REASON, + ), + _make_case( + "radix_sort", + _setup_sort, + _make_radix_sort, + _oneshot_radix_sort, + _twoshot_radix_sort, + "temp_storage_and_selector", + ), + _make_case( + "segmented_sort", + _setup_segmented_sort, + _make_segmented_sort, + _oneshot_segmented_sort, + _twoshot_segmented_sort, + "temp_storage_and_selector", + ), +] diff --git a/python/cuda_cccl/benchmarks/compute/host/compare_results.py b/python/cuda_cccl/benchmarks/compute/host/compare_results.py new file mode 100644 index 00000000000..7a670e11a01 --- /dev/null +++ 
b/python/cuda_cccl/benchmarks/compute/host/compare_results.py @@ -0,0 +1,258 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse +import json +import sys +from enum import StrEnum +from pathlib import Path +from typing import Any + +SCHEMA = "cuda.compute.host_benchmark.v1" + + +class _Color(StrEnum): + RED = "\033[31m" + GREEN = "\033[32m" + BLUE = "\033[34m" + YELLOW = "\033[33m" + RESET = "\033[0m" + NONE = "" + + +class _Emoji(StrEnum): + YELLOW = "\U0001f7e1" + BLUE = "\U0001f535" + GREEN = "\U0001f7e2" + RED = "\U0001f534" + NONE = "" + + +def _colorize(label: str, color: _Color, emoji: _Emoji, no_color: bool) -> str: + if no_color: + if emoji: + return f"{emoji} {label}" + return label + return f"{color}{label}{_Color.RESET}" + + +def _format_ns(ns: float) -> str: + if ns < 1_000: + return f"{ns:.1f} ns" + if ns < 1_000_000: + return f"{ns / 1_000:.2f} us" + return f"{ns / 1_000_000:.2f} ms" + + +def _format_percentage(value: float | None) -> str: + if value is None: + return "inf" + return f"{value * 100.0:.2f}%" + + +def _load(path: Path) -> dict[str, Any]: + payload = json.loads(path.read_text(encoding="utf-8")) + schema = payload.get("schema") + if schema != SCHEMA: + raise ValueError(f"{path}: expected schema {SCHEMA!r}, got {schema!r}") + return payload + + +def _result_map(payload: dict[str, Any]) -> dict[str, dict[str, Any]]: + return {result["name"]: result for result in payload["results"]} + + +def _minimum_noise(ref_noise: float | None, cmp_noise: float | None) -> float | None: + if ref_noise is not None and cmp_noise is not None: + return min(ref_noise, cmp_noise) + if ref_noise is not None: + return ref_noise + return cmp_noise + + +def _status( + ref_mean: float, + cmp_mean: float, + ref_noise: float | None, + cmp_noise: float | None, +) -> tuple[str, float, float, float | None]: + diff = cmp_mean - 
ref_mean + frac_diff = diff / ref_mean + min_noise = _minimum_noise(ref_noise, cmp_noise) + + if min_noise is None: + return "????", diff, frac_diff, min_noise + if abs(frac_diff) <= min_noise: + return "SAME", diff, frac_diff, min_noise + if diff < 0: + return "FAST", diff, frac_diff, min_noise + return "SLOW", diff, frac_diff, min_noise + + +def _format_status(status: str, *, no_color: bool) -> str: + if status == "SAME": + return _colorize(status, _Color.BLUE, _Emoji.BLUE, no_color) + if status == "FAST": + return _colorize(status, _Color.GREEN, _Emoji.GREEN, no_color) + if status == "SLOW": + return _colorize(status, _Color.RED, _Emoji.RED, no_color) + return _colorize(status, _Color.YELLOW, _Emoji.YELLOW, no_color) + + +def _print_table(rows: list[list[str]]) -> None: + headers = [ + "case", + "ref mean", + "ref noise", + "cmp mean", + "cmp noise", + "diff", + "%diff", + "status", + ] + widths = [ + max(len(row[index]) for row in [headers, *rows]) + for index in range(len(headers)) + ] + + def print_row(row: list[str]) -> None: + formatted = [] + for index, value in enumerate(row): + if index in (0, 7): + formatted.append(value.ljust(widths[index])) + else: + formatted.append(value.rjust(widths[index])) + print(" ".join(formatted)) + + print_row(headers) + print(" ".join("-" * width for width in widths)) + for row in rows: + print_row(row) + + +def compare( + ref_payload: dict[str, Any], + cmp_payload: dict[str, Any], + *, + no_color: bool, + threshold: float, +) -> dict[str, int]: + ref_results = _result_map(ref_payload) + cmp_results = _result_map(cmp_payload) + common_names = sorted(set(ref_results) & set(cmp_results)) + + rows = [] + counts = {"total": 0, "same": 0, "unknown": 0, "fast": 0, "slow": 0} + for name in common_names: + ref_result = ref_results[name] + cmp_result = cmp_results[name] + ref_mean = float(ref_result["mean"]) + cmp_mean = float(cmp_result["mean"]) + ref_noise = ref_result["relative_noise"] + cmp_noise = cmp_result["relative_noise"] + 
status, diff, frac_diff, _ = _status( + ref_mean, + cmp_mean, + ref_noise, + cmp_noise, + ) + + counts["total"] += 1 + if status == "SAME": + counts["same"] += 1 + elif status == "FAST": + counts["fast"] += 1 + elif status == "SLOW": + counts["slow"] += 1 + else: + counts["unknown"] += 1 + + if abs(frac_diff) < threshold: + continue + + rows.append( + [ + name, + _format_ns(ref_mean), + _format_percentage(ref_noise), + _format_ns(cmp_mean), + _format_percentage(cmp_noise), + _format_ns(diff), + _format_percentage(frac_diff), + _format_status(status, no_color=no_color), + ] + ) + + if rows: + _print_table(rows) + else: + print("No matching benchmark cases exceeded the display threshold.") + + missing_in_cmp = sorted(set(ref_results) - set(cmp_results)) + missing_in_ref = sorted(set(cmp_results) - set(ref_results)) + if missing_in_cmp: + print(f"\nMissing from compare: {', '.join(missing_in_cmp)}") + if missing_in_ref: + print(f"\nMissing from reference: {', '.join(missing_in_ref)}") + + print("\n# Summary\n") + print(f"- Total Matches: {counts['total']}") + print(f" - Same (diff <= min noise): {counts['same']}") + print(f" - Fast (cmp faster): {counts['fast']}") + print(f" - Slow (cmp slower): {counts['slow']}") + print(f" - Unknown (missing noise): {counts['unknown']}") + return counts + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Compare two cuda.compute host benchmark JSON outputs." 
+ ) + parser.add_argument("reference", type=Path) + parser.add_argument("compare", type=Path) + parser.add_argument( + "--threshold-diff", + type=float, + default=0.0, + help="Only show rows where absolute relative diff is at least this value.", + ) + parser.add_argument( + "--fail-on-change", + action="store_true", + help="Return nonzero if any case is classified FAST or SLOW.", + ) + parser.add_argument( + "--no-color", + action="store_true", + help="Use emoji instead of ANSI color codes.", + ) + args = parser.parse_args() + + ref_payload = _load(args.reference) + cmp_payload = _load(args.compare) + ref_benchmark = ref_payload["benchmark"] + cmp_benchmark = cmp_payload["benchmark"] + if ref_benchmark != cmp_benchmark: + print( + f"Benchmark types do not match: {ref_benchmark!r} vs {cmp_benchmark!r}", + file=sys.stderr, + ) + return 1 + + print(f"# {ref_benchmark}\n") + counts = compare( + ref_payload, + cmp_payload, + no_color=args.no_color, + threshold=args.threshold_diff, + ) + if args.fail_on_change: + return counts["fast"] + counts["slow"] + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py b/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py new file mode 100644 index 00000000000..fc9733f0702 --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py @@ -0,0 +1,75 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse + +import cuda.compute as cc + +from common import ( + add_case_filter, + add_json_output, + measure_call, + patch_wrapper_to_skip_native_compute, + print_results, + select_cases, + synchronize, + write_results_json, +) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Measure cached cuda.compute public one-shot host overhead." 
+ ) + parser.add_argument( + "--repeat", + type=int, + default=20, + help="Number of timing samples.", + ) + parser.add_argument( + "--number", + type=int, + default=100, + help="Number of calls per timing sample.", + ) + add_case_filter(parser) + add_json_output(parser) + args = parser.parse_args() + + results = [] + for case in select_cases(args.case): + cc.clear_all_caches() + state = case.setup() + wrapper = case.make_wrapper(state) + patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind) + synchronize() + + results.append( + measure_call( + case.name, + lambda case=case, state=state: case.oneshot(state), + repeat=args.repeat, + number=args.number, + ) + ) + + print_results(results) + if args.json is not None: + write_results_json( + args.json, + benchmark="oneshot_cached", + results=results, + config={ + "repeat": args.repeat, + "number": args.number, + "case": args.case, + }, + ) + + +if __name__ == "__main__": + main() diff --git a/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py b/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py new file mode 100644 index 00000000000..40370fa1aec --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py @@ -0,0 +1,77 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse + +import cuda.compute as cc + +from common import ( + add_case_filter, + add_json_output, + measure_call, + patch_wrapper_to_skip_native_compute, + print_results, + select_cases, + synchronize, + write_results_json, +) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Measure cached cuda.compute wrapper __call__ host overhead." 
+ ) + parser.add_argument( + "--repeat", + type=int, + default=20, + help="Number of timing samples.", + ) + parser.add_argument( + "--number", + type=int, + default=1000, + help="Number of calls per timing sample.", + ) + add_case_filter(parser) + add_json_output(parser) + args = parser.parse_args() + + results = [] + for case in select_cases(args.case): + cc.clear_all_caches() + state = case.setup() + wrapper = case.make_wrapper(state) + patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind) + synchronize() + + results.append( + measure_call( + case.name, + lambda case=case, state=state, wrapper=wrapper: case.twoshot( + state, wrapper + ), + repeat=args.repeat, + number=args.number, + ) + ) + + print_results(results) + if args.json is not None: + write_results_json( + args.json, + benchmark="twoshot_call", + results=results, + config={ + "repeat": args.repeat, + "number": args.number, + "case": args.case, + }, + ) + + +if __name__ == "__main__": + main() From 1786eb0ca17f02f262a28644d3fa9b87eb885c0b Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 16:18:45 -0500 Subject: [PATCH 06/16] Use pytest-benchmark instead --- .gitignore | 2 + .../benchmarks/compute/host/build_time.py | 63 ----- .../compute/host/compare_results.py | 258 ------------------ .../{common.py => host_benchmark_cases.py} | 192 +------------ .../benchmarks/compute/host/oneshot_cached.py | 75 ----- .../host/test_host_pytest_benchmark.py | 95 +++++++ .../benchmarks/compute/host/twoshot_call.py | 77 ------ python/cuda_cccl/benchmarks/compute/pixi.toml | 1 + python/cuda_cccl/pyproject.toml | 28 +- 9 files changed, 123 insertions(+), 668 deletions(-) delete mode 100644 python/cuda_cccl/benchmarks/compute/host/build_time.py delete mode 100644 python/cuda_cccl/benchmarks/compute/host/compare_results.py rename python/cuda_cccl/benchmarks/compute/host/{common.py => host_benchmark_cases.py} (84%) delete mode 100644 python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py 
create mode 100644 python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py delete mode 100644 python/cuda_cccl/benchmarks/compute/host/twoshot_call.py diff --git a/.gitignore b/.gitignore index 06687943b0e..0d8823d768b 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,8 @@ __pycache__ *.pyd wheelhouse/ bench-artifacts/ +# Local results written by pytest-benchmark --benchmark-save. +.benchmarks/ CLAUDE.local.md .codegraph/* .cursor/rules/codegraph.mdc diff --git a/python/cuda_cccl/benchmarks/compute/host/build_time.py b/python/cuda_cccl/benchmarks/compute/host/build_time.py deleted file mode 100644 index b445f57d879..00000000000 --- a/python/cuda_cccl/benchmarks/compute/host/build_time.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from __future__ import annotations - -import argparse -import time - -import cuda.compute as cc - -from common import ( - TimingResult, - add_case_filter, - add_json_output, - print_results, - select_cases, - synchronize, - write_results_json, -) - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Measure cold cuda.compute make_* build time." - ) - parser.add_argument( - "--repeat", - type=int, - default=10, - help="Number of cold build samples. 
Defaults to 10.", - ) - add_case_filter(parser) - add_json_output(parser) - args = parser.parse_args() - - results = [] - for case in select_cases(args.case): - state = case.setup() - synchronize() - - samples_ns = [] - for _ in range(args.repeat): - cc.clear_all_caches() - start = time.perf_counter_ns() - case.make_wrapper(state) - end = time.perf_counter_ns() - samples_ns.append(end - start) - - results.append(TimingResult(case.name, samples_ns=samples_ns, number=1)) - - print_results(results) - if args.json is not None: - write_results_json( - args.json, - benchmark="build_time", - results=results, - config={"repeat": args.repeat, "case": args.case}, - ) - - -if __name__ == "__main__": - main() diff --git a/python/cuda_cccl/benchmarks/compute/host/compare_results.py b/python/cuda_cccl/benchmarks/compute/host/compare_results.py deleted file mode 100644 index 7a670e11a01..00000000000 --- a/python/cuda_cccl/benchmarks/compute/host/compare_results.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
-# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from __future__ import annotations - -import argparse -import json -import sys -from enum import StrEnum -from pathlib import Path -from typing import Any - -SCHEMA = "cuda.compute.host_benchmark.v1" - - -class _Color(StrEnum): - RED = "\033[31m" - GREEN = "\033[32m" - BLUE = "\033[34m" - YELLOW = "\033[33m" - RESET = "\033[0m" - NONE = "" - - -class _Emoji(StrEnum): - YELLOW = "\U0001f7e1" - BLUE = "\U0001f535" - GREEN = "\U0001f7e2" - RED = "\U0001f534" - NONE = "" - - -def _colorize(label: str, color: _Color, emoji: _Emoji, no_color: bool) -> str: - if no_color: - if emoji: - return f"{emoji} {label}" - return label - return f"{color}{label}{_Color.RESET}" - - -def _format_ns(ns: float) -> str: - if ns < 1_000: - return f"{ns:.1f} ns" - if ns < 1_000_000: - return f"{ns / 1_000:.2f} us" - return f"{ns / 1_000_000:.2f} ms" - - -def _format_percentage(value: float | None) -> str: - if value is None: - return "inf" - return f"{value * 100.0:.2f}%" - - -def _load(path: Path) -> dict[str, Any]: - payload = json.loads(path.read_text(encoding="utf-8")) - schema = payload.get("schema") - if schema != SCHEMA: - raise ValueError(f"{path}: expected schema {SCHEMA!r}, got {schema!r}") - return payload - - -def _result_map(payload: dict[str, Any]) -> dict[str, dict[str, Any]]: - return {result["name"]: result for result in payload["results"]} - - -def _minimum_noise(ref_noise: float | None, cmp_noise: float | None) -> float | None: - if ref_noise is not None and cmp_noise is not None: - return min(ref_noise, cmp_noise) - if ref_noise is not None: - return ref_noise - return cmp_noise - - -def _status( - ref_mean: float, - cmp_mean: float, - ref_noise: float | None, - cmp_noise: float | None, -) -> tuple[str, float, float, float | None]: - diff = cmp_mean - ref_mean - frac_diff = diff / ref_mean - min_noise = _minimum_noise(ref_noise, cmp_noise) - - if min_noise is None: - return "????", diff, frac_diff, 
min_noise - if abs(frac_diff) <= min_noise: - return "SAME", diff, frac_diff, min_noise - if diff < 0: - return "FAST", diff, frac_diff, min_noise - return "SLOW", diff, frac_diff, min_noise - - -def _format_status(status: str, *, no_color: bool) -> str: - if status == "SAME": - return _colorize(status, _Color.BLUE, _Emoji.BLUE, no_color) - if status == "FAST": - return _colorize(status, _Color.GREEN, _Emoji.GREEN, no_color) - if status == "SLOW": - return _colorize(status, _Color.RED, _Emoji.RED, no_color) - return _colorize(status, _Color.YELLOW, _Emoji.YELLOW, no_color) - - -def _print_table(rows: list[list[str]]) -> None: - headers = [ - "case", - "ref mean", - "ref noise", - "cmp mean", - "cmp noise", - "diff", - "%diff", - "status", - ] - widths = [ - max(len(row[index]) for row in [headers, *rows]) - for index in range(len(headers)) - ] - - def print_row(row: list[str]) -> None: - formatted = [] - for index, value in enumerate(row): - if index in (0, 7): - formatted.append(value.ljust(widths[index])) - else: - formatted.append(value.rjust(widths[index])) - print(" ".join(formatted)) - - print_row(headers) - print(" ".join("-" * width for width in widths)) - for row in rows: - print_row(row) - - -def compare( - ref_payload: dict[str, Any], - cmp_payload: dict[str, Any], - *, - no_color: bool, - threshold: float, -) -> dict[str, int]: - ref_results = _result_map(ref_payload) - cmp_results = _result_map(cmp_payload) - common_names = sorted(set(ref_results) & set(cmp_results)) - - rows = [] - counts = {"total": 0, "same": 0, "unknown": 0, "fast": 0, "slow": 0} - for name in common_names: - ref_result = ref_results[name] - cmp_result = cmp_results[name] - ref_mean = float(ref_result["mean"]) - cmp_mean = float(cmp_result["mean"]) - ref_noise = ref_result["relative_noise"] - cmp_noise = cmp_result["relative_noise"] - status, diff, frac_diff, _ = _status( - ref_mean, - cmp_mean, - ref_noise, - cmp_noise, - ) - - counts["total"] += 1 - if status == "SAME": - 
counts["same"] += 1 - elif status == "FAST": - counts["fast"] += 1 - elif status == "SLOW": - counts["slow"] += 1 - else: - counts["unknown"] += 1 - - if abs(frac_diff) < threshold: - continue - - rows.append( - [ - name, - _format_ns(ref_mean), - _format_percentage(ref_noise), - _format_ns(cmp_mean), - _format_percentage(cmp_noise), - _format_ns(diff), - _format_percentage(frac_diff), - _format_status(status, no_color=no_color), - ] - ) - - if rows: - _print_table(rows) - else: - print("No matching benchmark cases exceeded the display threshold.") - - missing_in_cmp = sorted(set(ref_results) - set(cmp_results)) - missing_in_ref = sorted(set(cmp_results) - set(ref_results)) - if missing_in_cmp: - print(f"\nMissing from compare: {', '.join(missing_in_cmp)}") - if missing_in_ref: - print(f"\nMissing from reference: {', '.join(missing_in_ref)}") - - print("\n# Summary\n") - print(f"- Total Matches: {counts['total']}") - print(f" - Same (diff <= min noise): {counts['same']}") - print(f" - Fast (cmp faster): {counts['fast']}") - print(f" - Slow (cmp slower): {counts['slow']}") - print(f" - Unknown (missing noise): {counts['unknown']}") - return counts - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Compare two cuda.compute host benchmark JSON outputs." 
- ) - parser.add_argument("reference", type=Path) - parser.add_argument("compare", type=Path) - parser.add_argument( - "--threshold-diff", - type=float, - default=0.0, - help="Only show rows where absolute relative diff is at least this value.", - ) - parser.add_argument( - "--fail-on-change", - action="store_true", - help="Return nonzero if any case is classified FAST or SLOW.", - ) - parser.add_argument( - "--no-color", - action="store_true", - help="Use emoji instead of ANSI color codes.", - ) - args = parser.parse_args() - - ref_payload = _load(args.reference) - cmp_payload = _load(args.compare) - ref_benchmark = ref_payload["benchmark"] - cmp_benchmark = cmp_payload["benchmark"] - if ref_benchmark != cmp_benchmark: - print( - f"Benchmark types do not match: {ref_benchmark!r} vs {cmp_benchmark!r}", - file=sys.stderr, - ) - return 1 - - print(f"# {ref_benchmark}\n") - counts = compare( - ref_payload, - cmp_payload, - no_color=args.no_color, - threshold=args.threshold_diff, - ) - if args.fail_on_change: - return counts["fast"] + counts["slow"] - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/python/cuda_cccl/benchmarks/compute/host/common.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py similarity index 84% rename from python/cuda_cccl/benchmarks/compute/host/common.py rename to python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py index a6d7d4dea1b..3fa97827435 100644 --- a/python/cuda_cccl/benchmarks/compute/host/common.py +++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py @@ -4,17 +4,9 @@ from __future__ import annotations -import argparse -import json -import platform -import statistics -import sys -import time from dataclasses import dataclass -from datetime import datetime, timezone -from pathlib import Path from types import SimpleNamespace -from typing import Any, Callable, Iterable, Literal +from typing import Any, Callable, Literal import cupy as cp import numpy as np @@ 
-26,57 +18,10 @@ NOOP_TEMP_STORAGE_BYTES = 1 NUM_ITEMS = 128 NUM_SEGMENTS = 4 -MIN_SAMPLES_FOR_NOISE_ESTIMATE = 5 NoopReturnKind = Literal["none", "temp_storage_bytes", "temp_storage_and_selector"] -@dataclass(frozen=True) -class TimingResult: - name: str - samples_ns: list[float] - number: int - - @property - def min_ns(self) -> float: - return min(self.samples_ns) - - @property - def median_ns(self) -> float: - return statistics.median(self.samples_ns) - - @property - def mean_ns(self) -> float: - return statistics.mean(self.samples_ns) - - @property - def stdev_ns(self) -> float | None: - if len(self.samples_ns) < MIN_SAMPLES_FOR_NOISE_ESTIMATE: - return None - return statistics.stdev(self.samples_ns) - - @property - def relative_noise(self) -> float | None: - stdev_ns = self.stdev_ns - mean_ns = self.mean_ns - if stdev_ns is None or mean_ns <= 0: - return None - return stdev_ns / mean_ns - - def as_json(self) -> dict[str, Any]: - return { - "name": self.name, - "unit": "ns", - "number": self.number, - "samples": self.samples_ns, - "min": self.min_ns, - "median": self.median_ns, - "mean": self.mean_ns, - "stdev": self.stdev_ns, - "relative_noise": self.relative_noise, - } - - @dataclass(frozen=True) class HostBenchmarkCase: name: str @@ -140,141 +85,6 @@ def synchronize() -> None: cp.cuda.Device().synchronize() -def measure_call( - name: str, - fn: Callable[[], None], - *, - repeat: int, - number: int, -) -> TimingResult: - samples_ns = [] - for _ in range(repeat): - start = time.perf_counter_ns() - for _ in range(number): - fn() - end = time.perf_counter_ns() - samples_ns.append((end - start) / number) - return TimingResult(name=name, samples_ns=samples_ns, number=number) - - -def print_results(results: Iterable[TimingResult]) -> None: - rows = list(results) - name_width = max((len(row.name) for row in rows), default=4) - print( - f"{'case':<{name_width}} {'median':>12} {'min':>12} " - f"{'mean':>12} {'noise':>8} {'repeat':>6} {'number':>6}" - ) - print("-" * 
(name_width + 68)) - for result in rows: - print( - f"{result.name:<{name_width}} " - f"{_format_ns(result.median_ns):>12} " - f"{_format_ns(result.min_ns):>12} " - f"{_format_ns(result.mean_ns):>12} " - f"{_format_percentage(result.relative_noise):>8} " - f"{len(result.samples_ns):>6} " - f"{result.number:>6}" - ) - - -def _format_ns(ns: float) -> str: - if ns < 1_000: - return f"{ns:.1f} ns" - if ns < 1_000_000: - return f"{ns / 1_000:.2f} us" - return f"{ns / 1_000_000:.2f} ms" - - -def _format_percentage(value: float | None) -> str: - if value is None: - return "inf" - return f"{value * 100.0:.2f}%" - - -def add_case_filter(parser: argparse.ArgumentParser) -> None: - parser.add_argument( - "--case", - action="append", - choices=[case.name for case in CASES], - help="Benchmark case to run. May be passed multiple times.", - ) - - -def add_json_output(parser: argparse.ArgumentParser) -> None: - parser.add_argument( - "--json", - type=Path, - help="Write structured benchmark results to this JSON file.", - ) - - -def write_results_json( - path: Path, - *, - benchmark: str, - results: Iterable[TimingResult], - config: dict[str, Any], -) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - payload = { - "schema": "cuda.compute.host_benchmark.v1", - "benchmark": benchmark, - "created_at": datetime.now(timezone.utc).isoformat(), - "config": config, - "environment": _environment_info(), - "results": [result.as_json() for result in results], - } - path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") - - -def _environment_info() -> dict[str, Any]: - device_count = cp.cuda.runtime.getDeviceCount() - devices = [] - for device_id in range(device_count): - props = cp.cuda.runtime.getDeviceProperties(device_id) - name = props["name"] - if isinstance(name, bytes): - name = name.decode() - devices.append( - { - "id": device_id, - "name": name, - "compute_capability": [ - int(props["major"]), - int(props["minor"]), - ], - } - ) - - return { - 
"python": sys.version, - "platform": platform.platform(), - "devices": devices, - } - - -def select_cases(case_names: list[str] | None) -> list[HostBenchmarkCase]: - if not case_names: - selected_cases = CASES - else: - selected = set(case_names) - selected_cases = [case for case in CASES if case.name in selected] - - runnable = [] - skipped_by_reason: dict[str, list[str]] = {} - for case in selected_cases: - if case.skip_reason is None: - runnable.append(case) - else: - skipped_by_reason.setdefault(case.skip_reason, []).append(case.name) - - for reason, names in skipped_by_reason.items(): - print(f"Skipping {len(names)} benchmark case(s): {', '.join(names)}") - print(f" Reason: {reason}") - - return runnable - - def _numba_cuda_skip_reason() -> str | None: try: import numba.cuda # noqa: F401 diff --git a/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py b/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py deleted file mode 100644 index fc9733f0702..00000000000 --- a/python/cuda_cccl/benchmarks/compute/host/oneshot_cached.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from __future__ import annotations - -import argparse - -import cuda.compute as cc - -from common import ( - add_case_filter, - add_json_output, - measure_call, - patch_wrapper_to_skip_native_compute, - print_results, - select_cases, - synchronize, - write_results_json, -) - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Measure cached cuda.compute public one-shot host overhead." 
- ) - parser.add_argument( - "--repeat", - type=int, - default=20, - help="Number of timing samples.", - ) - parser.add_argument( - "--number", - type=int, - default=100, - help="Number of calls per timing sample.", - ) - add_case_filter(parser) - add_json_output(parser) - args = parser.parse_args() - - results = [] - for case in select_cases(args.case): - cc.clear_all_caches() - state = case.setup() - wrapper = case.make_wrapper(state) - patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind) - synchronize() - - results.append( - measure_call( - case.name, - lambda case=case, state=state: case.oneshot(state), - repeat=args.repeat, - number=args.number, - ) - ) - - print_results(results) - if args.json is not None: - write_results_json( - args.json, - benchmark="oneshot_cached", - results=results, - config={ - "repeat": args.repeat, - "number": args.number, - "case": args.case, - }, - ) - - -if __name__ == "__main__": - main() diff --git a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py new file mode 100644 index 00000000000..1d64260400e --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py @@ -0,0 +1,95 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import pytest + +import cuda.compute as cc + +from host_benchmark_cases import ( + CASES, + HostBenchmarkCase, + patch_wrapper_to_skip_native_compute, + synchronize, +) + +pytest.importorskip("pytest_benchmark") + +BUILD_TIME_ROUNDS = 10 +ONESHOT_ROUNDS = 20 +ONESHOT_ITERATIONS = 100 +TWOSHOT_ROUNDS = 20 +TWOSHOT_ITERATIONS = 1000 + + +def _case_params() -> list[pytest.ParameterSet]: + params = [] + for case in CASES: + marks = [] + if case.skip_reason is not None: + marks.append(pytest.mark.skip(reason=case.skip_reason)) + params.append(pytest.param(case, id=case.name, marks=marks)) + return params + + +@pytest.mark.benchmark(group="cuda.compute.host.build_time") +@pytest.mark.parametrize("case", _case_params()) +def test_build_time(benchmark, case: HostBenchmarkCase): + state = case.setup() + synchronize() + + def setup() -> None: + cc.clear_all_caches() + + def build(): + return case.make_wrapper(state) + + benchmark.pedantic( + build, + setup=setup, + rounds=BUILD_TIME_ROUNDS, + iterations=1, + warmup_rounds=0, + ) + + +@pytest.mark.benchmark(group="cuda.compute.host.oneshot_cached") +@pytest.mark.parametrize("case", _case_params()) +def test_oneshot_cached_host_overhead(benchmark, case: HostBenchmarkCase): + cc.clear_all_caches() + state = case.setup() + wrapper = case.make_wrapper(state) + patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind) + synchronize() + + def call() -> None: + case.oneshot(state) + + benchmark.pedantic( + call, + rounds=ONESHOT_ROUNDS, + iterations=ONESHOT_ITERATIONS, + warmup_rounds=0, + ) + + +@pytest.mark.benchmark(group="cuda.compute.host.twoshot_call") +@pytest.mark.parametrize("case", _case_params()) +def test_twoshot_call_host_overhead(benchmark, case: HostBenchmarkCase): + cc.clear_all_caches() + state = case.setup() + wrapper = case.make_wrapper(state) + patch_wrapper_to_skip_native_compute(wrapper, 
case.noop_return_kind) + synchronize() + + def call() -> None: + case.twoshot(state, wrapper) + + benchmark.pedantic( + call, + rounds=TWOSHOT_ROUNDS, + iterations=TWOSHOT_ITERATIONS, + warmup_rounds=0, + ) diff --git a/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py b/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py deleted file mode 100644 index 40370fa1aec..00000000000 --- a/python/cuda_cccl/benchmarks/compute/host/twoshot_call.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from __future__ import annotations - -import argparse - -import cuda.compute as cc - -from common import ( - add_case_filter, - add_json_output, - measure_call, - patch_wrapper_to_skip_native_compute, - print_results, - select_cases, - synchronize, - write_results_json, -) - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Measure cached cuda.compute wrapper __call__ host overhead." 
- ) - parser.add_argument( - "--repeat", - type=int, - default=20, - help="Number of timing samples.", - ) - parser.add_argument( - "--number", - type=int, - default=1000, - help="Number of calls per timing sample.", - ) - add_case_filter(parser) - add_json_output(parser) - args = parser.parse_args() - - results = [] - for case in select_cases(args.case): - cc.clear_all_caches() - state = case.setup() - wrapper = case.make_wrapper(state) - patch_wrapper_to_skip_native_compute(wrapper, case.noop_return_kind) - synchronize() - - results.append( - measure_call( - case.name, - lambda case=case, state=state, wrapper=wrapper: case.twoshot( - state, wrapper - ), - repeat=args.repeat, - number=args.number, - ) - ) - - print_results(results) - if args.json is not None: - write_results_json( - args.json, - benchmark="twoshot_call", - results=results, - config={ - "repeat": args.repeat, - "number": args.number, - "case": args.case, - }, - ) - - -if __name__ == "__main__": - main() diff --git a/python/cuda_cccl/benchmarks/compute/pixi.toml b/python/cuda_cccl/benchmarks/compute/pixi.toml index 35ed8792506..b53fee44387 100644 --- a/python/cuda_cccl/benchmarks/compute/pixi.toml +++ b/python/cuda_cccl/benchmarks/compute/pixi.toml @@ -19,6 +19,7 @@ cuda-version = "13.1.*" python = "3.13.*" numpy = "*" cupy = "*" +pytest-benchmark = "*" pyyaml = "*" pre-commit = "*" diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 967bc86d58b..98e0e6533c0 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -88,10 +88,30 @@ test-cu12 = [ test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x"] test-sysctk12 = ["cuda-cccl[sysctk12]", "pytest", "pytest-xdist", "cupy-cuda12x"] test-sysctk13 = ["cuda-cccl[sysctk13]", "pytest", "pytest-xdist", "cupy-cuda13x"] -bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]", "cupy-cuda12x"] -bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]", "cupy-cuda13x"] -bench-sysctk12 = 
["cuda-cccl[sysctk12]", "cuda-bench[cu12]", "cupy-cuda12x"] -bench-sysctk13 = ["cuda-cccl[sysctk13]", "cuda-bench[cu13]", "cupy-cuda13x"] +bench-cu12 = [ + "cuda-cccl[cu12]", + "cuda-bench[cu12]", + "cupy-cuda12x", + "pytest-benchmark", +] +bench-cu13 = [ + "cuda-cccl[cu13]", + "cuda-bench[cu13]", + "cupy-cuda13x", + "pytest-benchmark", +] +bench-sysctk12 = [ + "cuda-cccl[sysctk12]", + "cuda-bench[cu12]", + "cupy-cuda12x", + "pytest-benchmark", +] +bench-sysctk13 = [ + "cuda-cccl[sysctk13]", + "cuda-bench[cu13]", + "cupy-cuda13x", + "pytest-benchmark", +] [project.urls] Homepage = "https://github.com/NVIDIA/cccl" From a9219391427996f600bb66916ea76cc99ceb8891 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:34:18 +0000 Subject: [PATCH 07/16] [pre-commit.ci] auto code formatting --- .../benchmarks/compute/host/host_benchmark_cases.py | 8 ++------ .../benchmarks/compute/host/test_host_pytest_benchmark.py | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py index 3fa97827435..769846405bf 100644 --- a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py +++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py @@ -269,9 +269,7 @@ def _twoshot_scan(state: SimpleNamespace, wrapper) -> None: def _setup_segmented_reduce() -> SimpleNamespace: d_in = cp.arange(NUM_ITEMS, dtype=cp.int32) d_out = cp.empty(NUM_SEGMENTS, dtype=cp.int32) - offsets = cp.asarray( - np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64) - ) + offsets = cp.asarray(np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)) return SimpleNamespace( d_in=d_in, d_out=d_out, @@ -687,9 +685,7 @@ def _twoshot_radix_sort(state: SimpleNamespace, wrapper) -> None: def _setup_segmented_sort() -> SimpleNamespace: state = _setup_sort() - offsets = 
cp.asarray( - np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64) - ) + offsets = cp.asarray(np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)) state.start_offsets = offsets[:-1] state.end_offsets = offsets[1:] state.num_segments = NUM_SEGMENTS diff --git a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py index 1d64260400e..4e62cdd3a6d 100644 --- a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py +++ b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py @@ -5,9 +5,6 @@ from __future__ import annotations import pytest - -import cuda.compute as cc - from host_benchmark_cases import ( CASES, HostBenchmarkCase, @@ -15,6 +12,8 @@ synchronize, ) +import cuda.compute as cc + pytest.importorskip("pytest_benchmark") BUILD_TIME_ROUNDS = 10 From 1571d769a36d7135ff760e9742face5a2a97c6a2 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 16:49:10 -0500 Subject: [PATCH 08/16] Add a case that accepts a stream --- .../compute/host/host_benchmark_cases.py | 460 ++++++++++++++++++ .../host/test_host_pytest_benchmark.py | 11 +- 2 files changed, 466 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py index 769846405bf..9550650567f 100644 --- a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py +++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py @@ -12,6 +12,7 @@ import numpy as np import cuda.compute as cc +from cuda.core import Device from cuda.compute._cpp_compile import compile_cpp_op_code from cuda.compute.op import RawOp @@ -81,6 +82,12 @@ def make_tiny_temp_storage() -> cp.ndarray: return cp.empty(NOOP_TEMP_STORAGE_BYTES, dtype=cp.uint8) +def make_stream(): + device = Device() + device.set_current() + return device.create_stream() + + def synchronize() -> 
None: cp.cuda.Device().synchronize() @@ -228,6 +235,29 @@ def _twoshot_reduce(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_reduce_stream(state: SimpleNamespace) -> None: + cc.reduce_into( + d_in=state.d_in, + d_out=state.d_out[:1], + num_items=state.num_items, + op=state.op, + h_init=state.h_init, + stream=state.stream, + ) + + +def _twoshot_reduce_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out[:1], + num_items=state.num_items, + op=state.op, + h_init=state.h_init, + stream=state.stream, + ) + + def _setup_scan() -> SimpleNamespace: state = _setup_unary_input_output() state.h_init = np.array([0], dtype=np.int32) @@ -266,6 +296,29 @@ def _twoshot_scan(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_scan_stream(state: SimpleNamespace) -> None: + cc.exclusive_scan( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + init_value=state.h_init, + num_items=state.num_items, + stream=state.stream, + ) + + +def _twoshot_scan_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + init_value=state.h_init, + num_items=state.num_items, + stream=state.stream, + ) + + def _setup_segmented_reduce() -> SimpleNamespace: d_in = cp.arange(NUM_ITEMS, dtype=cp.int32) d_out = cp.empty(NUM_SEGMENTS, dtype=cp.int32) @@ -318,6 +371,33 @@ def _twoshot_segmented_reduce(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_segmented_reduce_stream(state: SimpleNamespace) -> None: + cc.segmented_reduce( + d_in=state.d_in, + d_out=state.d_out, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + op=state.op, + h_init=state.h_init, + stream=state.stream, + ) + + +def _twoshot_segmented_reduce_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out, + 
num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + op=state.op, + h_init=state.h_init, + stream=state.stream, + ) + + def _make_unary_transform(state: SimpleNamespace): return cc.make_unary_transform( d_in=state.d_in, @@ -344,6 +424,26 @@ def _twoshot_unary_transform(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_unary_transform_stream(state: SimpleNamespace) -> None: + cc.unary_transform( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + stream=state.stream, + ) + + +def _twoshot_unary_transform_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + d_in=state.d_in, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + stream=state.stream, + ) + + def _make_binary_transform(state: SimpleNamespace): return cc.make_binary_transform( d_in1=state.d_in1, @@ -373,6 +473,28 @@ def _twoshot_binary_transform(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_binary_transform_stream(state: SimpleNamespace) -> None: + cc.binary_transform( + d_in1=state.d_in1, + d_in2=state.d_in2, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + stream=state.stream, + ) + + +def _twoshot_binary_transform_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + d_in1=state.d_in1, + d_in2=state.d_in2, + d_out=state.d_out, + op=state.op, + num_items=state.num_items, + stream=state.stream, + ) + + def _setup_histogram() -> SimpleNamespace: d_samples = cp.arange(NUM_ITEMS, dtype=cp.int32) num_output_levels = 17 @@ -427,6 +549,31 @@ def _twoshot_histogram(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_histogram_stream(state: SimpleNamespace) -> None: + cc.histogram_even( + d_samples=state.d_samples, + d_histogram=state.d_histogram, + num_output_levels=state.num_output_levels, + lower_level=state.lower_level, + upper_level=state.upper_level, + num_samples=state.num_samples, + stream=state.stream, + ) + + +def 
_twoshot_histogram_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_samples=state.d_samples, + d_histogram=state.d_histogram, + h_num_output_levels=state.h_num_output_levels, + h_lower_level=state.h_lower_level, + h_upper_level=state.h_upper_level, + num_samples=state.num_samples, + stream=state.stream, + ) + + def _setup_binary_search() -> SimpleNamespace: d_data = cp.arange(NUM_ITEMS, dtype=cp.int32) d_values = cp.arange(0, NUM_ITEMS, 2, dtype=cp.int32) @@ -472,6 +619,30 @@ def _twoshot_lower_bound(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_lower_bound_stream(state: SimpleNamespace) -> None: + cc.lower_bound( + d_data=state.d_data, + num_items=state.num_items, + d_values=state.d_values, + num_values=state.num_values, + d_out=state.d_out, + comp=state.comp, + stream=state.stream, + ) + + +def _twoshot_lower_bound_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + d_data=state.d_data, + num_items=state.num_items, + d_values=state.d_values, + num_values=state.num_values, + d_out=state.d_out, + comp=state.comp, + stream=state.stream, + ) + + def _setup_select() -> SimpleNamespace: state = _setup_unary_input_output() state.d_num_selected = cp.empty(1, dtype=np.uint64) @@ -510,6 +681,29 @@ def _twoshot_select(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_select_stream(state: SimpleNamespace) -> None: + cc.select( + d_in=state.d_in, + d_out=state.d_out, + d_num_selected_out=state.d_num_selected, + cond=state.cond, + num_items=state.num_items, + stream=state.stream, + ) + + +def _twoshot_select_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_out=state.d_out, + d_num_selected_out=state.d_num_selected, + cond=state.cond, + num_items=state.num_items, + stream=state.stream, + ) + + def _setup_three_way_partition() -> SimpleNamespace: state = _setup_unary_input_output() state.d_first = cp.empty_like(state.d_in) @@ -561,6 
+755,35 @@ def _twoshot_three_way_partition(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_three_way_partition_stream(state: SimpleNamespace) -> None: + cc.three_way_partition( + d_in=state.d_in, + d_first_part_out=state.d_first, + d_second_part_out=state.d_second, + d_unselected_out=state.d_unselected, + d_num_selected_out=state.d_num_selected, + select_first_part_op=state.first_op, + select_second_part_op=state.second_op, + num_items=state.num_items, + stream=state.stream, + ) + + +def _twoshot_three_way_partition_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in=state.d_in, + d_first_part_out=state.d_first, + d_second_part_out=state.d_second, + d_unselected_out=state.d_unselected, + d_num_selected_out=state.d_num_selected, + select_first_part_op=state.first_op, + select_second_part_op=state.second_op, + num_items=state.num_items, + stream=state.stream, + ) + + def _setup_unique_by_key() -> SimpleNamespace: d_keys = cp.arange(NUM_ITEMS, dtype=cp.int32) d_items = cp.arange(NUM_ITEMS, dtype=cp.int32) @@ -612,6 +835,33 @@ def _twoshot_unique_by_key(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_unique_by_key_stream(state: SimpleNamespace) -> None: + cc.unique_by_key( + d_in_keys=state.d_in_keys, + d_in_items=state.d_in_items, + d_out_keys=state.d_out_keys, + d_out_items=state.d_out_items, + d_out_num_selected=state.d_num_selected, + op=state.op, + num_items=state.num_items, + stream=state.stream, + ) + + +def _twoshot_unique_by_key_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_in_items=state.d_in_items, + d_out_keys=state.d_out_keys, + d_out_items=state.d_out_items, + d_out_num_selected=state.d_num_selected, + op=state.op, + num_items=state.num_items, + stream=state.stream, + ) + + def _setup_sort() -> SimpleNamespace: d_in_keys = cp.arange(NUM_ITEMS, 0, -1, dtype=cp.int32) d_out_keys = cp.empty_like(d_in_keys) 
@@ -653,6 +903,29 @@ def _twoshot_merge_sort(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_merge_sort_stream(state: SimpleNamespace) -> None: + cc.merge_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + num_items=state.num_items, + op=state.op, + stream=state.stream, + ) + + +def _twoshot_merge_sort_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_in_values=None, + d_out_keys=state.d_out_keys, + d_out_values=None, + num_items=state.num_items, + op=state.op, + stream=state.stream, + ) + + def _make_radix_sort(state: SimpleNamespace): return cc.make_radix_sort( d_in_keys=state.d_in_keys, @@ -683,6 +956,28 @@ def _twoshot_radix_sort(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_radix_sort_stream(state: SimpleNamespace) -> None: + cc.radix_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + num_items=state.num_items, + order=cc.SortOrder.ASCENDING, + stream=state.stream, + ) + + +def _twoshot_radix_sort_stream(state: SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + num_items=state.num_items, + stream=state.stream, + ) + + def _setup_segmented_sort() -> SimpleNamespace: state = _setup_sort() offsets = cp.asarray(np.linspace(0, NUM_ITEMS, NUM_SEGMENTS + 1, dtype=np.int64)) @@ -732,6 +1027,36 @@ def _twoshot_segmented_sort(state: SimpleNamespace, wrapper) -> None: ) +def _oneshot_segmented_sort_stream(state: SimpleNamespace) -> None: + cc.segmented_sort( + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + num_items=state.num_items, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + order=cc.SortOrder.ASCENDING, + stream=state.stream, + ) + + +def _twoshot_segmented_sort_stream(state: 
SimpleNamespace, wrapper) -> None: + wrapper( + temp_storage=state.temp_storage, + d_in_keys=state.d_in_keys, + d_out_keys=state.d_out_keys, + d_in_values=None, + d_out_values=None, + num_items=state.num_items, + num_segments=state.num_segments, + start_offsets_in=state.start_offsets, + end_offsets_in=state.end_offsets, + stream=state.stream, + ) + + def _setup_with_values( setup_fn: Callable[[], SimpleNamespace], **values: Any ) -> Callable[[], SimpleNamespace]: @@ -756,6 +1081,17 @@ def setup() -> SimpleNamespace: return setup +def _setup_with_stream( + setup_fn: Callable[[], SimpleNamespace], +) -> Callable[[], SimpleNamespace]: + def setup() -> SimpleNamespace: + state = setup_fn() + state.stream = make_stream() + return state + + return setup + + def _make_case( name: str, setup: Callable[[], SimpleNamespace], @@ -1067,3 +1403,127 @@ def _make_case( "temp_storage_and_selector", ), ] + + +STREAM_CASES = [ + _make_case( + "reduce.plus.stream", + _setup_with_stream(_setup_with_values(_setup_reduce, op=cc.OpKind.PLUS)), + _make_reduce, + _oneshot_reduce_stream, + _twoshot_reduce_stream, + "temp_storage_bytes", + ), + _make_case( + "exclusive_scan.plus.stream", + _setup_with_stream(_setup_with_values(_setup_scan, op=cc.OpKind.PLUS)), + _make_scan, + _oneshot_scan_stream, + _twoshot_scan_stream, + "temp_storage_bytes", + ), + _make_case( + "segmented_reduce.plus.stream", + _setup_with_stream( + _setup_with_values(_setup_segmented_reduce, op=cc.OpKind.PLUS) + ), + _make_segmented_reduce, + _oneshot_segmented_reduce_stream, + _twoshot_segmented_reduce_stream, + "temp_storage_bytes", + ), + _make_case( + "unary_transform.identity.stream", + _setup_with_stream( + _setup_with_values(_setup_unary_input_output, op=cc.OpKind.IDENTITY) + ), + _make_unary_transform, + _oneshot_unary_transform_stream, + _twoshot_unary_transform_stream, + "none", + ), + _make_case( + "binary_transform.plus.stream", + _setup_with_stream( + _setup_with_values(_setup_binary_input_output, 
op=cc.OpKind.PLUS) + ), + _make_binary_transform, + _oneshot_binary_transform_stream, + _twoshot_binary_transform_stream, + "none", + ), + _make_case( + "histogram_even.stream", + _setup_with_stream(_setup_histogram), + _make_histogram, + _oneshot_histogram_stream, + _twoshot_histogram_stream, + "temp_storage_bytes", + ), + _make_case( + "lower_bound.less.stream", + _setup_with_stream(_setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS)), + _make_lower_bound, + _oneshot_lower_bound_stream, + _twoshot_lower_bound_stream, + "none", + ), + _make_case( + "select.logical_not.stream", + _setup_with_stream(_setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT)), + _make_select, + _oneshot_select_stream, + _twoshot_select_stream, + "temp_storage_bytes", + ), + _make_case( + "three_way_partition.logical_not.stream", + _setup_with_stream( + _setup_with_values( + _setup_three_way_partition, + first_op=cc.OpKind.LOGICAL_NOT, + second_op=cc.OpKind.LOGICAL_NOT, + ) + ), + _make_three_way_partition, + _oneshot_three_way_partition_stream, + _twoshot_three_way_partition_stream, + "temp_storage_bytes", + ), + _make_case( + "unique_by_key.equal.stream", + _setup_with_stream( + _setup_with_values(_setup_unique_by_key, op=cc.OpKind.EQUAL_TO) + ), + _make_unique_by_key, + _oneshot_unique_by_key_stream, + _twoshot_unique_by_key_stream, + "temp_storage_bytes", + ), + _make_case( + "merge_sort.less.stream", + _setup_with_stream(_setup_with_values(_setup_sort, op=cc.OpKind.LESS)), + _make_merge_sort, + _oneshot_merge_sort_stream, + _twoshot_merge_sort_stream, + "temp_storage_bytes", + ), + _make_case( + "radix_sort.stream", + _setup_with_stream(_setup_sort), + _make_radix_sort, + _oneshot_radix_sort_stream, + _twoshot_radix_sort_stream, + "temp_storage_and_selector", + ), + _make_case( + "segmented_sort.stream", + _setup_with_stream(_setup_segmented_sort), + _make_segmented_sort, + _oneshot_segmented_sort_stream, + _twoshot_segmented_sort_stream, + 
"temp_storage_and_selector", + ), +] + +CALL_CASES = CASES + STREAM_CASES diff --git a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py index 4e62cdd3a6d..1fb6b8953cf 100644 --- a/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py +++ b/python/cuda_cccl/benchmarks/compute/host/test_host_pytest_benchmark.py @@ -6,6 +6,7 @@ import pytest from host_benchmark_cases import ( + CALL_CASES, CASES, HostBenchmarkCase, patch_wrapper_to_skip_native_compute, @@ -23,9 +24,9 @@ TWOSHOT_ITERATIONS = 1000 -def _case_params() -> list[pytest.ParameterSet]: +def _case_params(cases: list[HostBenchmarkCase]) -> list[pytest.ParameterSet]: params = [] - for case in CASES: + for case in cases: marks = [] if case.skip_reason is not None: marks.append(pytest.mark.skip(reason=case.skip_reason)) @@ -34,7 +35,7 @@ def _case_params() -> list[pytest.ParameterSet]: @pytest.mark.benchmark(group="cuda.compute.host.build_time") -@pytest.mark.parametrize("case", _case_params()) +@pytest.mark.parametrize("case", _case_params(CASES)) def test_build_time(benchmark, case: HostBenchmarkCase): state = case.setup() synchronize() @@ -55,7 +56,7 @@ def build(): @pytest.mark.benchmark(group="cuda.compute.host.oneshot_cached") -@pytest.mark.parametrize("case", _case_params()) +@pytest.mark.parametrize("case", _case_params(CALL_CASES)) def test_oneshot_cached_host_overhead(benchmark, case: HostBenchmarkCase): cc.clear_all_caches() state = case.setup() @@ -75,7 +76,7 @@ def call() -> None: @pytest.mark.benchmark(group="cuda.compute.host.twoshot_call") -@pytest.mark.parametrize("case", _case_params()) +@pytest.mark.parametrize("case", _case_params(CALL_CASES)) def test_twoshot_call_host_overhead(benchmark, case: HostBenchmarkCase): cc.clear_all_caches() state = case.setup() From 8172fd6a09ae068632a7d7cc5782a26bbf2b6bc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:53:27 +0000 Subject: [PATCH 09/16] [pre-commit.ci] auto code formatting --- .../benchmarks/compute/host/host_benchmark_cases.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py index 9550650567f..ceaa51709d6 100644 --- a/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py +++ b/python/cuda_cccl/benchmarks/compute/host/host_benchmark_cases.py @@ -12,9 +12,9 @@ import numpy as np import cuda.compute as cc -from cuda.core import Device from cuda.compute._cpp_compile import compile_cpp_op_code from cuda.compute.op import RawOp +from cuda.core import Device NOOP_TEMP_STORAGE_BYTES = 1 NUM_ITEMS = 128 @@ -1462,7 +1462,9 @@ def _make_case( ), _make_case( "lower_bound.less.stream", - _setup_with_stream(_setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS)), + _setup_with_stream( + _setup_with_values(_setup_binary_search, comp=cc.OpKind.LESS) + ), _make_lower_bound, _oneshot_lower_bound_stream, _twoshot_lower_bound_stream, @@ -1470,7 +1472,9 @@ def _make_case( ), _make_case( "select.logical_not.stream", - _setup_with_stream(_setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT)), + _setup_with_stream( + _setup_with_values(_setup_select, cond=cc.OpKind.LOGICAL_NOT) + ), _make_select, _oneshot_select_stream, _twoshot_select_stream, From c564d3e13e9931f25052099ec0f461ccee824e08 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 17:04:23 -0500 Subject: [PATCH 10/16] Address comments --- ci/test_cuda_compute_minimal_python.sh | 4 ++-- python/cuda_cccl/tests/compute/test_binary_search.py | 1 + python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh 
index c48b313ed31..672338bbf33 100755 --- a/ci/test_cuda_compute_minimal_python.sh +++ b/ci/test_cuda_compute_minimal_python.sh @@ -19,8 +19,8 @@ setup_python_env "${py_version}" # Fetch or build the cuda_cccl wheel: if [[ -n "${GITHUB_ACTIONS:-}" ]]; then wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh") - "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/ - wheelhouse_dir="/home/coder/cccl/wheelhouse" + "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/" + wheelhouse_dir="${repo_root}/wheelhouse" else "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}" wheelhouse_dir="${repo_root}/wheelhouse" diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 585a998c44c..1e24ca00116 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -47,6 +47,7 @@ def disable_sass_check(monkeypatch): (cuda.compute.upper_bound, "right"), ], ) +@pytest.mark.no_numba def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 9f2683f6798..b9813558de3 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -71,7 +71,7 @@ def _raw_less_than_i32(name: str, threshold: int) -> RawOp: source = f""" extern "C" __device__ void {name}(void* x, void* result) {{ int value = *static_cast(x); - *static_cast(result) = value < {threshold}; + *static_cast(result) = value < {threshold} ? 
1 : 0; }} """ return RawOp(ltoir=compile_cpp_op_code(source), name=name) From 79df8e0a62cce4a6b943a369406da3987c989aa8 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 17:11:35 -0500 Subject: [PATCH 11/16] fix merge conflict --- python/cuda_cccl/tests/compute/test_binary_search.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 9b07eeb8815..1e24ca00116 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -47,10 +47,7 @@ def disable_sass_check(monkeypatch): (cuda.compute.upper_bound, "right"), ], ) -<<<<<<< HEAD @pytest.mark.no_numba -======= ->>>>>>> main def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) From c2e7c01b5732f968520ef25c21503f244fb02a6b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:16:44 +0000 Subject: [PATCH 12/16] [pre-commit.ci] auto code formatting --- python/cuda_cccl/tests/compute/test_radix_sort.py | 2 +- python/cuda_cccl/tests/compute/test_scan.py | 2 +- python/cuda_cccl/tests/compute/test_select.py | 2 +- python/cuda_cccl/tests/compute/test_three_way_partition.py | 2 +- python/cuda_cccl/tests/compute/test_unique_by_key.py | 2 +- python/cuda_cccl/tests/compute/test_zip_iterator.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index 67186d3a4f7..d39a7e07fad 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -10,11 +10,11 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( DoubleBuffer, SortOrder, ) +from 
cuda.core import Device def get_mark(dt, log_size): diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index 97f5fae630a..0eb3af34c49 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -8,7 +8,6 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( CountingIterator, OpKind, @@ -16,6 +15,7 @@ TransformOutputIterator, gpu_struct, ) +from cuda.core import Device def scan_host(h_input: np.ndarray, op, h_init, force_inclusive): diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index d467d33c635..3dcaf98a507 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -7,9 +7,9 @@ import pytest import cuda.compute +from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct from cuda.compute._cpp_compile import compile_cpp_op_code from cuda.compute.op import RawOp -from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct DTYPE_LIST = [ np.uint8, diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index b9813558de3..8def513681b 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -7,9 +7,9 @@ import pytest import cuda.compute +from cuda.compute import CacheModifiedInputIterator, gpu_struct from cuda.compute._cpp_compile import compile_cpp_op_code from cuda.compute.op import RawOp -from cuda.compute import CacheModifiedInputIterator, gpu_struct DTYPE_LIST = [ np.uint8, diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 5f7d310d71e..945d4d99028 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ 
b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -8,13 +8,13 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( CacheModifiedInputIterator, DiscardIterator, OpKind, gpu_struct, ) +from cuda.core import Device DTYPE_LIST = [ np.uint8, diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index aea1f167f9f..9fbed38359f 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -6,13 +6,13 @@ import pytest import cuda.compute -from cuda.core import Device from cuda.compute import ( CountingIterator, TransformIterator, ZipIterator, gpu_struct, ) +from cuda.core import Device @pytest.mark.parametrize("num_items", [10, 1_000, 100_000]) From b09c9c6ecb2df85d4e4fed2279fcfd7efe255254 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Fri, 12 Jun 2026 17:26:21 -0500 Subject: [PATCH 13/16] Fix pre-commit --- .../tests/compute/test_void_ptr_wrapper_validation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py index e4d8185eb91..2788a3c0893 100644 --- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py +++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py @@ -16,8 +16,12 @@ numba = pytest.importorskip("numba") types = numba.types -from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper -from cuda.compute._utils import sanitize_identifier +from cuda.compute._odr_helpers import ( # noqa: E402 + _ArgMode, + _ArgSpec, + _create_void_ptr_wrapper, +) +from cuda.compute._utils import sanitize_identifier # noqa: E402 def _make_arg_specs(): From 4e5b9ccb0d7b3bda7dbf3d55b71d3d0ff7879bba Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Mon, 15 Jun 2026 12:10:01 -0500 Subject: [PATCH 
14/16] Document new caching behavior and add examples --- docs/python/compute/index.rst | 41 ++++++++++-- .../examples/free_threading/__init__.py | 3 + .../examples/free_threading/direct_api.py | 46 ++++++++++++++ .../examples/free_threading/object_api.py | 63 +++++++++++++++++++ 4 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 python/cuda_cccl/tests/compute/examples/free_threading/__init__.py create mode 100644 python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py create mode 100644 python/cuda_cccl/tests/compute/examples/free_threading/object_api.py diff --git a/docs/python/compute/index.rst b/docs/python/compute/index.rst index ba8519a8352..f193cbe95da 100644 --- a/docs/python/compute/index.rst +++ b/docs/python/compute/index.rst @@ -250,10 +250,14 @@ When working with structured data, there are two common memory layouts: Caching ------- -Algorithms in ``cuda.compute`` are compiled to GPU code at runtime. To avoid -recompiling on every call, build results are cached in memory. When you invoke -an algorithm with the same configuration—same dtypes, iterator kinds, operator, -and compute capability—the cached build is reused. +Algorithms in ``cuda.compute`` are compiled to GPU code at runtime. To +avoid recompiling on every call, build results are cached in memory. +When you invoke an algorithm with the same configuration—same dtypes, +iterator kinds, operator, compute capability, and current device—the +cached build is reused. On systems with multiple GPUs, builds may be +cached separately for each GPU. When free-threaded Python is enabled, +compiled build results may be reused by multiple threads in the same +process. 
What determines the cache key +++++++++++++++++++++++++++++ @@ -265,12 +269,41 @@ Each algorithm computes a cache key from: * **Operator identity** — for user-defined functions, the function's bytecode, constants, and closure contents (see below) * **Compute capability** — the GPU architecture of the current device +* **Current device** — the CUDA device active when the algorithm is built * **Algorithm-specific parameters** — such as initial value dtype or determinism mode Note that array *contents* or *pointers* are not part of the cache key—only the array's dtype. This means you can reuse a cached algorithm across different arrays of the same type. +Multi-GPU behavior +++++++++++++++++++ + +Cached builds are device-specific. If the same algorithm configuration is used +on multiple GPUs, ``cuda.compute`` may compile and cache a separate build for +each device. Set the intended current CUDA device before constructing or invoking +an algorithm, and pass arrays that are valid on that device. + +Free-threaded Python +++++++++++++++++++++ + +When ``cuda.compute`` is built for a free-threaded Python interpreter, +independent calls from multiple Python threads can reuse compiled build results +within the same process. + +The cache is local to the current Python process. Separate Python processes build +and cache independently, even if they use the same GPU and algorithm +configuration. + +This does not make user-provided memory or CUDA work automatically safe to share. +Users are still responsible for avoiding data races, such as two threads writing +to the same output array at the same time. For concurrent use, prefer the direct +algorithm APIs, such as +:func:`reduce_into `, or create a separate +reusable algorithm object in each thread (for example, the object returned by +:func:`make_reduce_into `). If multiple +threads share one of these objects, serialize access to that object. 
+ How user-defined functions are cached +++++++++++++++++++++++++++++++++++++ diff --git a/python/cuda_cccl/tests/compute/examples/free_threading/__init__.py b/python/cuda_cccl/tests/compute/examples/free_threading/__init__.py new file mode 100644 index 00000000000..8bbe3ce1ab8 --- /dev/null +++ b/python/cuda_cccl/tests/compute/examples/free_threading/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py b/python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py new file mode 100644 index 00000000000..02cbe05a6f6 --- /dev/null +++ b/python/cuda_cccl/tests/compute/examples/free_threading/direct_api.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# example-begin +""" +Run independent direct API calls from multiple Python threads. 
+""" + +from concurrent.futures import ThreadPoolExecutor + +import cupy as cp +import numpy as np + +import cuda.compute +from cuda.compute import OpKind + + +def reduce_values(h_input): + dtype = np.int32 + h_init = np.array([0], dtype=dtype) + d_input = cp.asarray(h_input, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) + + cuda.compute.reduce_into( + d_in=d_input, + d_out=d_output, + num_items=len(h_input), + op=OpKind.PLUS, + h_init=h_init, + ) + + return int(d_output.get()[0]) + + +inputs = [ + np.array([1, 2, 3, 4], dtype=np.int32), + np.array([5, 6, 7, 8], dtype=np.int32), +] + +with ThreadPoolExecutor(max_workers=len(inputs)) as executor: + results = list(executor.map(reduce_values, inputs)) + +expected = [int(np.sum(h_input)) for h_input in inputs] +assert results == expected +print(f"Free-threaded direct API results: {results}") diff --git a/python/cuda_cccl/tests/compute/examples/free_threading/object_api.py b/python/cuda_cccl/tests/compute/examples/free_threading/object_api.py new file mode 100644 index 00000000000..2b7ec1de3ba --- /dev/null +++ b/python/cuda_cccl/tests/compute/examples/free_threading/object_api.py @@ -0,0 +1,63 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# example-begin +""" +Run independent object-based API calls from multiple Python threads. 
+""" + +from concurrent.futures import ThreadPoolExecutor + +import cupy as cp +import numpy as np + +import cuda.compute +from cuda.compute import OpKind + + +def reduce_values(h_input): + dtype = np.int32 + h_init = np.array([0], dtype=dtype) + d_input = cp.asarray(h_input, dtype=dtype) + d_output = cp.empty(1, dtype=dtype) + + reducer = cuda.compute.make_reduce_into( + d_in=d_input, + d_out=d_output, + op=OpKind.PLUS, + h_init=h_init, + ) + temp_storage_size = reducer( + temp_storage=None, + d_in=d_input, + d_out=d_output, + num_items=len(h_input), + op=OpKind.PLUS, + h_init=h_init, + ) + d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8) + + reducer( + temp_storage=d_temp_storage, + d_in=d_input, + d_out=d_output, + num_items=len(h_input), + op=OpKind.PLUS, + h_init=h_init, + ) + + return int(d_output.get()[0]) + + +inputs = [ + np.array([1, 2, 3, 4], dtype=np.int32), + np.array([5, 6, 7, 8], dtype=np.int32), +] + +with ThreadPoolExecutor(max_workers=len(inputs)) as executor: + results = list(executor.map(reduce_values, inputs)) + +expected = [int(np.sum(h_input)) for h_input in inputs] +assert results == expected +print(f"Free-threaded object API results: {results}") From cd0858e6066545cdda8fb3a97175d564ca3fcad9 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Mon, 15 Jun 2026 14:41:24 -0500 Subject: [PATCH 15/16] Update developer documentation --- docs/python/compute/developer_overview.rst | 170 +++++++++++++++++++++ docs/python/compute/index.rst | 5 +- 2 files changed, 174 insertions(+), 1 deletion(-) diff --git a/docs/python/compute/developer_overview.rst b/docs/python/compute/developer_overview.rst index c6d30971b01..6cec0b38e5f 100644 --- a/docs/python/compute/developer_overview.rst +++ b/docs/python/compute/developer_overview.rst @@ -451,6 +451,176 @@ as an example: At this point, the kernels stored in the reduction object are launched and the reduction is performed. 
+Caching and free-threaded Python +-------------------------------- + +The user-facing cache behavior is described in :ref:`cuda.compute.caching`. This +section describes the implementation contracts that keep that behavior correct +for free-threaded Python and multi-GPU use. + +Design requirements ++++++++++++++++++++ + +The free-threading design is constrained by the following requirements: + +* Importing ``cuda.compute`` in a free-threaded CPython interpreter must not + re-enable the GIL. +* Free-threading support should not add global locking or shared-state + contention to the normal single-threaded execution path. Wrapper cache hits + should be thread-local, and normal algorithm execution should not take a + global cache lock. +* Mutable wrapper state must not be shared across threads. +* Expensive native build results should still be shared across threads when they + are safe to share. +* Same-key concurrent cold builds should build once; waiters should receive the + same result or observe the same exception. + +Build and validation requirements ++++++++++++++++++++++++++++++++++ + +The Cython extension that backs ``cuda.compute`` must opt in to free-threaded +execution: + +.. code-block:: cython + + # cython: freethreading_compatible=True + +Without this marker, importing the extension in a free-threaded CPython process +can cause CPython to re-enable the GIL. The generated extension should advertise +``Py_MOD_GIL_NOT_USED`` and importing ``cuda.compute`` should leave +``sys._is_gil_enabled()`` false. + +The free-threaded wheel must also keep its free-threaded ABI tag after repair and +merge steps. For CPython 3.14, the expected wheel tag contains +``cp314-cp314t`` rather than the regular ``cp314-cp314`` tag. 
The acceptance +criteria for a free-threaded build are: + +* the wheel has the expected ``cp314-cp314t`` ABI tag; +* importing ``cuda.compute`` does not re-enable the GIL; +* the free-threading stress suite passes without forcing ``PYTHON_GIL=0`` or + ``-X gil=0``. + +Two cache layers +++++++++++++++++ + +Internally, ``cuda.compute`` separates two kinds of cached state: + +* **Wrapper objects** are the Python objects returned by ``make_*`` APIs, such as + ``make_reduce_into``. They own per-call descriptor state and are cached per + Python thread by ``cache_with_registered_key_functions`` in + ``cuda/compute/_caching.py``. Keeping wrapper caches thread-local avoids + sharing mutable wrapper state across concurrent calls from free-threaded + Python. +* **Build results** are the Cython objects that own the native C parallel build + state, such as loaded CUDA libraries, kernels, policy state, and other + read-only data needed to invoke an algorithm. They are cached by + ``cache_build_result`` and may be shared by wrapper objects in different + Python threads. + +The normal cache-hit path is intentionally cheap. A wrapper-cache hit is +thread-local and does not take the shared build-cache lock. The shared +build-cache lock is used when constructing a wrapper that needs to look up, +coordinate, or create a native build result, not during ordinary execution of an +already-returned wrapper object. + +Device keying ++++++++++++++ + +Both cache layers include the current CUDA runtime device ordinal and compute +capability in their keys. The compute capability identifies the architecture used +for code generation and policy selection. The device ordinal keeps native build +state associated with the device on which it was built. + +The first implementation intentionally keys shared build results by CUDA runtime +device ordinal rather than by CUDA context handle. User-managed CUDA driver +contexts are not a target use case for ``cuda.compute``. 
CUDA runtime, +``cuda.core``, CuPy, and PyTorch-style applications are expected to use the +primary-context model, and language frontends generally prefer that model. + +The first implementation also does not share build results across devices that +happen to have the same compute capability. Native build results are not treated +as pure SM-level code artifacts. They can contain CUDA-facing build/load state, +and CUB launch paths may resolve a ``CUkernel`` to the current-context +``CUfunction`` before occupancy queries or launch. Some paths also get or set +kernel attributes on the resolved function, and CUDA kernel-attribute behavior +is device-specific. Until every build-result path is audited for same-SM +cross-device sharing, separate device ordinals build and cache separate native +results. + +Concurrent build coordination ++++++++++++++++++++++++++++++ + +``cache_build_result`` is responsible for coordinating concurrent cache misses. +The first thread to miss a build-result key runs the builder, while other +threads wait for that in-flight build to complete. If the build succeeds, all +waiting threads receive the same cached build result. If it fails, the exception +is propagated to the waiting threads and the failed build is not stored in the +cache. + +When adding a new algorithm, the factory that returns the reusable wrapper object +should use ``cache_with_registered_key_functions``. The wrapper constructor +should pass the expensive native build operation to ``cache_build_result`` if +that native state is safe to share across threads. Do not perform an expensive +native build before entering ``cache_build_result``; otherwise same-key cold +factory calls can duplicate the build and bypass single-flight coordination. + +The specialization key must include every argument that can affect generated +code, type layout, policy selection, or native build state. 
It should not include +runtime-only values such as array pointers, array contents, item counts, streams, +or temporary-storage pointers unless those values change the compiled interface. + +User-object and descriptor contracts +++++++++++++++++++++++++++++++++++++ + +Wrapper objects returned by ``make_*`` APIs are not thread-reentrant. If two +threads need the same algorithm specialization, each thread should call the +factory and receive its own wrapper object, or the caller must externally +serialize access to a shared wrapper. The wrapper updates its Cython +``Iterator``, ``Op``, ``Value``, and algorithm-specific descriptors before each +native call, so concurrent calls through the same wrapper could overwrite the +descriptor state another thread is about to use. + +Read-only iterator and operator objects may be shared across threads. The +iterator base class uses a per-iterator lock for first-time lazy construction of +advance, input-dereference, and output-dereference ``Op`` objects; cached access +after that remains lock-free. This lock does not make arbitrary mutation safe: +concurrent mutation of iterator state, operator state, captured state, or child +iterators remains unsupported unless the caller synchronizes externally. + +Mutable execution state belongs to one thread at a time unless the caller +provides synchronization. This includes output arrays, temporary-storage buffers, +streams, ``DoubleBuffer`` instances, and other objects whose state changes as +part of a launch. + +Backend-specific notes +++++++++++++++++++++++ + +The v1 NVRTC/nvJitLink backend and the v2 HostJIT backend have different +free-threading risk surfaces and must be audited independently. v1 stresses +NVRTC, nvJitLink, CUDA library loading, and CUB host dispatch. v2 adds HostJIT +compiler state, LLVM/Clang initialization, persistent PCH paths, generated +source/cubin artifacts, and dynamic loader lifetime. + +Transform has one additional v1 native-cache rule. 
In CPython 3.14 +free-threaded builds, ``python/cuda_cccl/CMakeLists.txt`` defines +``CCCL_PYTHON_FREE_THREADED`` for the bundled C parallel target, and +``c/parallel/src/transform.cu`` uses that macro to bypass the native +``async_config`` / ``prefetch_config`` cache. Normal non-free-threaded builds +keep the existing lazy native cache path. This avoids adding launch-path locking +for transform in free-threaded Python builds while preserving the existing +single-threaded behavior elsewhere. + +Clearing caches ++++++++++++++++ + +``clear_all_caches()`` is process-local. It clears all known per-thread wrapper +caches through a weak registry of live thread cache containers, and it clears the +shared build-result cache. Separate Python processes build and cache +independently. + +Calling ``clear_all_caches()`` concurrently with active factory calls or +algorithm execution is not supported unless the caller synchronizes externally. + For readers who want to connect this overview back to the source tree: diff --git a/docs/python/compute/index.rst b/docs/python/compute/index.rst index f193cbe95da..e5f82bb8386 100644 --- a/docs/python/compute/index.rst +++ b/docs/python/compute/index.rst @@ -297,7 +297,10 @@ configuration. This does not make user-provided memory or CUDA work automatically safe to share. Users are still responsible for avoiding data races, such as two threads writing -to the same output array at the same time. For concurrent use, prefer the direct +to the same output array at the same time. Read-only iterator and operator +objects may be shared across threads, but concurrent mutation of those objects, +captured state, or underlying arrays requires external synchronization. 
For +concurrent use, prefer the direct algorithm APIs, such as :func:`reduce_into `, or create a separate reusable algorithm object in each thread (for example, the object returned by From 15652c00259b5650977a4fb33d66d4bd680559b6 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 23 Jun 2026 09:34:33 -0500 Subject: [PATCH 16/16] Move no numba tests to separate file --- ci/test_cuda_compute_minimal_python.sh | 21 +- .../tests/compute/test_binary_search.py | 1 - .../cuda_cccl/tests/compute/test_bindings.py | 2 - .../compute/test_deferred_annotations.py | 2 - .../tests/compute/test_func_caching.py | 9 - .../cuda_cccl/tests/compute/test_histogram.py | 2 - .../cuda_cccl/tests/compute/test_iterators.py | 10 +- .../tests/compute/test_merge_sort.py | 55 +- .../tests/compute/test_nested_struct.py | 5 - .../cuda_cccl/tests/compute/test_no_numba.py | 560 +++++++++++++++++- .../compute/test_permutation_iterator.py | 6 - .../tests/compute/test_radix_sort.py | 76 ++- python/cuda_cccl/tests/compute/test_reduce.py | 38 +- python/cuda_cccl/tests/compute/test_scan.py | 16 +- .../tests/compute/test_segmented_reduce.py | 3 - .../tests/compute/test_segmented_sort.py | 51 +- python/cuda_cccl/tests/compute/test_select.py | 31 - .../tests/compute/test_shuffle_iterator.py | 1 - .../compute/test_struct_field_validation.py | 2 - .../tests/compute/test_three_way_partition.py | 46 -- .../cuda_cccl/tests/compute/test_transform.py | 5 - .../tests/compute/test_unique_by_key.py | 69 ++- .../test_void_ptr_wrapper_validation.py | 12 +- .../tests/compute/test_zip_iterator.py | 11 +- 24 files changed, 716 insertions(+), 318 deletions(-) diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh index 672338bbf33..a43bbac7bfa 100755 --- a/ci/test_cuda_compute_minimal_python.sh +++ b/ci/test_cuda_compute_minimal_python.sh @@ -27,25 +27,10 @@ else fi # Install cuda_cccl with the minimal CUDA extra. 
This intentionally avoids the -# full cu* extras because those pull in numba/numba-cuda. In a clean minimal -# environment, the test phase below runs only tests marked no_numba. +# full cu* extras because those pull in numba/numba-cuda. CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)" python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]" python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x" -if python - <<'PY' -try: - import numba.cuda # noqa: F401 -except Exception as exc: - print(f"numba.cuda unavailable; running no_numba subset: {exc!r}") - raise SystemExit(1) -else: - print("numba.cuda available; running full compute test suite.") -PY -then - cd "${repo_root}/python/cuda_cccl/tests/" - python -m pytest -n 6 -v compute/ -m "not large" -else - cd "${repo_root}/python/cuda_cccl/tests/" - python -m pytest -n 6 -v compute/ -m "not large and no_numba" -fi +cd "${repo_root}/python/cuda_cccl/tests/" +python -m pytest -n 6 -v compute/test_no_numba.py diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 1e24ca00116..585a998c44c 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -47,7 +47,6 @@ def disable_sass_check(monkeypatch): (cuda.compute.upper_bound, "right"), ], ) -@pytest.mark.no_numba def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) diff --git a/python/cuda_cccl/tests/compute/test_bindings.py b/python/cuda_cccl/tests/compute/test_bindings.py index ccfbd8d0bc3..2f16da620ef 100644 --- a/python/cuda_cccl/tests/compute/test_bindings.py +++ b/python/cuda_cccl/tests/compute/test_bindings.py @@ -4,8 +4,6 @@ import cuda.compute._bindings as bindings -pytestmark = pytest.mark.no_numba - @pytest.fixture( params=[ diff --git 
a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py index a584b8ffbe2..c30aeda4068 100644 --- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py +++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py @@ -7,12 +7,10 @@ import cupy as cp import numpy as np -import pytest from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into -@pytest.mark.no_numba def test_deferred_annotations(): # test that we can use @gpu_struct with deferred annotations # GH: #6421 diff --git a/python/cuda_cccl/tests/compute/test_func_caching.py b/python/cuda_cccl/tests/compute/test_func_caching.py index e94e0069529..8fcf47b32a7 100644 --- a/python/cuda_cccl/tests/compute/test_func_caching.py +++ b/python/cuda_cccl/tests/compute/test_func_caching.py @@ -1,12 +1,10 @@ import numpy as np -import pytest from cuda.compute._caching import CachableFunction global_x = 1 -@pytest.mark.no_numba def test_func_caching_basic(): def func(x): return x @@ -21,7 +19,6 @@ def func(x): assert f1 == f2 -@pytest.mark.no_numba def test_func_caching_different_names(): def func(x): return x @@ -36,7 +33,6 @@ def func2(x): assert f1 != f2 -@pytest.mark.no_numba def test_func_caching_different_code(): def func(x): return x @@ -50,7 +46,6 @@ def func(x): assert f1 != f2 -@pytest.mark.no_numba def test_func_caching_with_closure(): def factory(x): def func(y): @@ -66,7 +61,6 @@ def func(y): assert f1 != f3 -@pytest.mark.no_numba def test_func_caching_with_numpy_numeric_scalar_closure(): def factory(indexlength, regularsize): index_dtype = np.int64 @@ -86,7 +80,6 @@ def func(counter): assert f1 != f3 -@pytest.mark.no_numba def test_func_caching_with_global_variable(): global global_x @@ -133,7 +126,6 @@ def func(x): assert CachableFunction(func1) != CachableFunction(func3) -@pytest.mark.no_numba def test_func_caching_with_global_np_ufunc(): def make_func(): def func(x): @@ -153,7 +145,6 @@ def func(x): assert 
CachableFunction(func1) != CachableFunction(func2) -@pytest.mark.no_numba def test_func_caching_with_aliased_np_ufunc(): def make_func1(): amin = np.argmin diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py index 5c104114754..0a9d96674e6 100644 --- a/python/cuda_cccl/tests/compute/test_histogram.py +++ b/python/cuda_cccl/tests/compute/test_histogram.py @@ -12,8 +12,6 @@ CountingIterator, ) -pytestmark = pytest.mark.no_numba - DTYPE_LIST = [ np.uint8, np.uint16, diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py index c3b6acc9fc9..49e2ab39d15 100644 --- a/python/cuda_cccl/tests/compute/test_iterators.py +++ b/python/cuda_cccl/tests/compute/test_iterators.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp +import numba.cuda import numpy as np import pytest @@ -21,7 +22,6 @@ ) -@pytest.mark.no_numba def test_constant_iterator_equality(): it1 = ConstantIterator(np.int32(0)) it2 = ConstantIterator(np.int32(0)) @@ -32,7 +32,6 @@ def test_constant_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba def test_counting_iterator_equality(): it1 = CountingIterator(np.int32(0)) it2 = CountingIterator(np.int32(0)) @@ -43,7 +42,6 @@ def test_counting_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba def test_cache_modified_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -124,14 +122,11 @@ def reverse_iterator_array(request): if array_type == "cupy": array = cp.array(base_array) else: - import numba.cuda - array = numba.cuda.to_device(base_array) return array -@pytest.mark.no_numba def test_reverse_input_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -146,7 +141,6 @@ def test_reverse_input_iterator_equality(): assert it1.kind != it4.kind 
-@pytest.mark.no_numba def test_reverse_output_iterator_equality(): ary1 = cp.asarray([0, 1, 2], dtype="int32") ary2 = cp.asarray([3, 4, 5], dtype="int32") @@ -161,7 +155,6 @@ def test_reverse_output_iterator_equality(): assert it1.kind != it4.kind -@pytest.mark.no_numba @pytest.mark.parametrize( "shape, itemsize, expected", [ @@ -186,7 +179,6 @@ def test_compute_c_contiguous_strides_in_bytes(shape, itemsize, expected): assert result == expected -@pytest.mark.no_numba @pytest.mark.parametrize( "shape, dtype", [ diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py index db97d5aca01..33d7d15de58 100644 --- a/python/cuda_cccl/tests/compute/test_merge_sort.py +++ b/python/cuda_cccl/tests/compute/test_merge_sort.py @@ -5,6 +5,7 @@ from typing import List import cupy as cp +import numba.cuda import numpy as np import pytest @@ -81,11 +82,11 @@ def compare_op(lhs, rhs): def test_merge_sort_keys(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) - d_in_keys = cp.asarray(h_in_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items) - h_out_keys = d_in_keys.get() + h_out_keys = d_in_keys.copy_to_host() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -101,13 +102,13 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch): h_in_keys = random_array(num_items, dtype) h_in_items = random_array(num_items, np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items) - h_out_keys = d_in_keys.get() - h_out_items = d_in_items.get() + h_out_keys = d_in_keys.copy_to_host() + h_out_items = d_in_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -122,12 +123,12 @@ def 
test_merge_sort_keys_copy(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() h_in_keys.sort() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -145,15 +146,15 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items) - h_out_keys = d_out_keys.get() - h_out_items = d_out_items.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_items = d_out_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -224,11 +225,11 @@ def compare_complex(lhs, rhs): imaginary = random_array(num_items, np.int64, max_value) h_in_keys = real + 1j * imaginary - d_in_keys = cp.asarray(h_in_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items) - h_out_keys = d_in_keys.get() + h_out_keys = d_in_keys.copy_to_host() h_in_keys = h_in_keys[np.argsort(h_in_keys.real, stable=True)] np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -239,15 +240,15 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) 
h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream") merge_sort_device(i_input, None, d_out_keys, None, op, num_items) h_in_keys.sort() - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() np.testing.assert_array_equal(h_out_keys, h_in_keys) @@ -264,10 +265,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -276,8 +277,8 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) i_input_keys, i_input_items, d_out_keys, d_out_items, op, num_items ) - h_out_keys = d_out_keys.get() - h_out_items = d_out_items.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_items = d_out_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -306,7 +307,6 @@ def test_merge_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) -@pytest.mark.no_numba def test_merge_sort_well_known_less(): dtype = np.int32 @@ -326,7 +326,6 @@ def test_merge_sort_well_known_less(): np.testing.assert_equal(d_out_keys.get(), expected) -@pytest.mark.no_numba def test_merge_sort_well_known_greater(): dtype = np.int32 diff --git 
a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py index 715873af3c8..9cbfa40e393 100644 --- a/python/cuda_cccl/tests/compute/test_nested_struct.py +++ b/python/cuda_cccl/tests/compute/test_nested_struct.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np -import pytest import cuda.compute from cuda.compute import ZipIterator, gpu_struct @@ -150,7 +149,6 @@ def sum_pixels(p1, p2): assert result["color"]["b"] == expected_b -@pytest.mark.no_numba def test_dict_init_nested_struct(): """Test initializing a nested struct with a dictionary.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -164,7 +162,6 @@ def test_dict_init_nested_struct(): assert np.isclose(obj.inner.b, 3.14) -@pytest.mark.no_numba def test_dict_init_per_field(): """Test initializing a struct with a dictionary for a nested field.""" Inner = gpu_struct({"a": np.int32, "b": np.float32}) @@ -178,7 +175,6 @@ def test_dict_init_per_field(): assert np.isclose(obj.inner.b, 3.14) -@pytest.mark.no_numba def test_dict_init_deeply_nested(): """Test initializing deeply nested structs (3+ levels) with dictionaries.""" Level1 = gpu_struct({"value": np.int32}) @@ -193,7 +189,6 @@ def test_dict_init_deeply_nested(): assert obj.middle.nested.value == 42 -@pytest.mark.no_numba def test_dict_init_mixed(): """Test mixed initialization with some dicts and some direct values.""" Inner1 = gpu_struct({"a": np.int32, "b": np.int32}) diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py index bbab965c8bf..6fb1ef0e811 100644 --- a/python/cuda_cccl/tests/compute/test_no_numba.py +++ b/python/cuda_cccl/tests/compute/test_no_numba.py @@ -1,12 +1,568 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import cupy as cp +import numpy as np import pytest -# Check that tests marked no_numba fail fast if they import numba. +import cuda.compute +from cuda.compute import ( + CacheModifiedInputIterator, + ConstantIterator, + CountingIterator, + DiscardIterator, + OpKind, + PermutationIterator, + ReverseIterator, + ShuffleIterator, + SortOrder, + TransformIterator, + TransformOutputIterator, + ZipIterator, +) +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp +from cuda.compute.types import int16 as cccl_int16 +from cuda.compute.types import int32 as cccl_int32 + +# These tests define the minimal-extra integration contract. They intentionally +# use small fixed inputs and avoid the Python-callable operator path. pytestmark = pytest.mark.no_numba -@pytest.mark.no_numba +def _raw_op(source: str, name: str) -> RawOp: + return RawOp(ltoir=compile_cpp_op_code(source), name=name) + + +def _raw_even_i32_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_even_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = (value % 2) == 0; +} +""" + return _raw_op(source, "no_numba_even_i32") + + +def _raw_less_than_i32_op(name: str, threshold: int) -> RawOp: + source = f""" +extern "C" __device__ void {name}(void* x, void* result) {{ + int value = *static_cast(x); + *static_cast(result) = value < {threshold} ? 
1 : 0; +}} +""" + return _raw_op(source, name) + + +def _raw_plus_i64_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_plus_i64( + void* lhs, + void* rhs, + void* result +) { + *static_cast(result) = + *static_cast(lhs) + *static_cast(rhs); +} +""" + return _raw_op(source, "no_numba_plus_i64") + + +def _raw_square_i32_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_square_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = value * value; +} +""" + return _raw_op(source, "no_numba_square_i32") + + +def _raw_zip_sum_i32_op() -> RawOp: + source = """ +struct Zip2I32 { + int field_0; + int field_1; +}; + +extern "C" __device__ void no_numba_zip_sum_i32(void* x, void* result) { + auto values = static_cast(x); + *static_cast(result) = values->field_0 + values->field_1; +} +""" + return _raw_op(source, "no_numba_zip_sum_i32") + + +def _raw_negate_i16_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_negate_i16(void* x, void* result) { + *static_cast(result) = -*static_cast(x); +} +""" + return _raw_op(source, "no_numba_negate_i16") + + def test_import_numba_raises(): with pytest.raises( ImportError, match="This test is marked 'no_numba' but attempted to import it" ): import numba.cuda # noqa: F401 + + +def test_reduce_well_known_plus(): + h_input = np.arange(1, 14, dtype=np.int32) + d_input = cp.asarray(h_input) + d_output = cp.empty(1, dtype=np.int32) + h_init = np.array([5], dtype=np.int32) + + cuda.compute.reduce_into( + d_in=d_input, + d_out=d_output, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=h_init, + ) + + assert d_output.get()[0] == np.sum(h_input, initial=h_init[0]) + + +def test_exclusive_scan_well_known_plus(): + d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16) + d_output = cp.empty_like(d_input) + h_init = np.array([1], dtype=np.uint16) + + cuda.compute.exclusive_scan( + d_in=d_input, + d_out=d_output, + op=OpKind.PLUS, + init_value=h_init, + 
num_items=d_input.size, + ) + + expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16) + np.testing.assert_array_equal(d_output.get(), expected) + + +def test_binary_transform_well_known_plus(): + d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32) + d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.binary_transform( + d_in1=d_lhs, + d_in2=d_rhs, + d_out=d_output, + op=OpKind.PLUS, + num_items=d_lhs.size, + ) + + np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get()) + + +def test_unary_transform_well_known_negate(): + d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8) + d_output = cp.empty_like(d_input) + + cuda.compute.unary_transform( + d_in=d_input, + d_out=d_output, + op=OpKind.NEGATE, + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4])) + + +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_explicit_opkind_less(search, side): + h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64) + h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64) + d_out = cp.empty(h_values.size, dtype=np.uintp) + + search( + d_data=cp.asarray(h_data), + num_items=h_data.size, + d_values=cp.asarray(h_values), + num_values=h_values.size, + d_out=d_out, + comp=OpKind.LESS, + ) + + expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + +def test_segmented_reduce_well_known_plus(monkeypatch): + monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) + + d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32) + d_starts = cp.asarray([0, 3, 5], dtype=np.int32) + d_ends = cp.asarray([3, 5, 8], dtype=np.int32) + d_output = cp.empty(3, dtype=np.uint32) + h_init = np.array([0], dtype=np.uint32) + + cuda.compute.segmented_reduce( + 
d_in=d_input, + d_out=d_output, + num_segments=3, + start_offsets_in=d_starts, + end_offsets_in=d_ends, + op=OpKind.PLUS, + h_init=h_init, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21])) + + +def test_merge_sort_well_known_less(): + d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64) + d_output = cp.empty_like(d_input) + + cuda.compute.merge_sort( + d_in_keys=d_input, + d_in_values=None, + d_out_keys=d_output, + d_out_values=None, + num_items=d_input.size, + op=OpKind.LESS, + ) + + np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get())) + + +def test_radix_sort_key_value_pairs(): + h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16) + h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8) + d_out_keys = cp.empty_like(cp.asarray(h_keys)) + d_out_values = cp.empty_like(cp.asarray(h_values)) + + cuda.compute.radix_sort( + d_in_keys=cp.asarray(h_keys), + d_out_keys=d_out_keys, + d_in_values=cp.asarray(h_values), + d_out_values=d_out_values, + num_items=h_keys.size, + order=SortOrder.ASCENDING, + ) + + order = np.argsort(h_keys, stable=True) + np.testing.assert_array_equal(d_out_keys.get(), h_keys[order]) + np.testing.assert_array_equal(d_out_values.get(), h_values[order]) + + +def test_segmented_sort_keys(): + h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64) + h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64) + d_output = cp.empty_like(cp.asarray(h_keys)) + + cuda.compute.segmented_sort( + d_in_keys=cp.asarray(h_keys), + d_out_keys=d_output, + d_in_values=None, + d_out_values=None, + num_items=h_keys.size, + num_segments=h_offsets.size - 1, + start_offsets_in=cp.asarray(h_offsets[:-1]), + end_offsets_in=cp.asarray(h_offsets[1:]), + order=SortOrder.ASCENDING, + ) + + expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64) + np.testing.assert_array_equal(d_output.get(), expected) + + +def test_unique_by_key_well_known_equal_to(monkeypatch): + cc_major, _ = 
cuda.compute._cccl_interop.CudaDevice().compute_capability + if cc_major >= 9: + monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) + + d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16) + d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8) + d_out_keys = cp.empty_like(d_keys) + d_out_values = cp.empty_like(d_values) + d_num_selected = cp.empty(1, dtype=np.int64) + + cuda.compute.unique_by_key( + d_in_keys=d_keys, + d_in_items=d_values, + d_out_keys=d_out_keys, + d_out_items=d_out_values, + d_out_num_selected=d_num_selected, + op=OpKind.EQUAL_TO, + num_items=d_keys.size, + ) + + num_selected = int(d_num_selected.get()[0]) + np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4]) + np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40]) + + +def test_histogram_even_small_range(): + h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32) + d_histogram = cp.empty(4, dtype=np.int32) + + cuda.compute.histogram_even( + d_samples=cp.asarray(h_samples), + d_histogram=d_histogram, + num_output_levels=5, + lower_level=np.float32(0.0), + upper_level=np.float32(4.0), + num_samples=h_samples.size, + ) + + expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0)) + np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32)) + + +def test_select_raw_op(): + h_input = np.arange(12, dtype=np.int32) + d_output = cp.empty_like(cp.asarray(h_input)) + d_num_selected = cp.empty(1, dtype=np.uint64) + + cuda.compute.select( + d_in=cp.asarray(h_input), + d_out=d_output, + d_num_selected_out=d_num_selected, + cond=_raw_even_i32_op(), + num_items=h_input.size, + ) + + num_selected = int(d_num_selected.get()[0]) + np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2]) + + +def test_three_way_partition_raw_op(): + h_input = np.arange(12, dtype=np.int32) + d_first = cp.empty_like(cp.asarray(h_input)) + d_second = 
cp.empty_like(cp.asarray(h_input)) + d_unselected = cp.empty_like(cp.asarray(h_input)) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.three_way_partition( + d_in=cp.asarray(h_input), + d_first_part_out=d_first, + d_second_part_out=d_second, + d_unselected_out=d_unselected, + d_num_selected_out=d_num_selected, + select_first_part_op=_raw_less_than_i32_op("no_numba_less_than_4_i32", 4), + select_second_part_op=_raw_less_than_i32_op("no_numba_less_than_8_i32", 8), + num_items=h_input.size, + ) + + selected = d_num_selected.get() + first_count = int(selected[0]) + second_count = int(selected[1]) + unselected_count = h_input.size - first_count - second_count + + np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4]) + np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8]) + np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:]) + + +def test_raw_op_reduce(): + h_input = np.asarray([10, 20, 30, 40], dtype=np.int64) + d_output = cp.empty(1, dtype=np.int64) + + cuda.compute.reduce_into( + d_in=cp.asarray(h_input), + d_out=d_output, + num_items=h_input.size, + op=_raw_plus_i64_op(), + h_init=np.array([5], dtype=np.int64), + ) + + assert d_output.get()[0] == 105 + + +def test_stream_argument(cuda_stream): + d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32) + d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.binary_transform( + d_in1=d_lhs, + d_in2=d_rhs, + d_out=d_output, + op=OpKind.PLUS, + num_items=d_lhs.size, + stream=cuda_stream, + ) + + cp.cuda.Device().synchronize() + np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19])) + + +def test_counting_iterator_reduce(): + d_output = cp.empty(1, dtype=np.int32) + + cuda.compute.reduce_into( + d_in=CountingIterator(np.int32(3)), + d_out=d_output, + num_items=8, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int32), + ) + + assert d_output.get()[0] == 52 + + +def 
test_constant_iterator_reduce(): + d_output = cp.empty(1, dtype=np.float32) + + cuda.compute.reduce_into( + d_in=ConstantIterator(np.float32(1.5)), + d_out=d_output, + num_items=8, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.float32), + ) + + np.testing.assert_allclose(d_output.get()[0], np.float32(12.0)) + + +def test_cache_modified_input_iterator_reduce(): + d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16) + d_output = cp.empty(1, dtype=np.uint16) + iterator = CacheModifiedInputIterator(d_input, modifier="stream") + + cuda.compute.reduce_into( + d_in=iterator, + d_out=d_output, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.uint16), + ) + + assert d_output.get()[0] == 30 + + +def test_reverse_input_iterator_scan(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_output = cp.empty_like(d_input) + + cuda.compute.inclusive_scan( + d_in=ReverseIterator(d_input), + d_out=d_output, + op=OpKind.PLUS, + init_value=np.array([0], dtype=np.int32), + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15])) + + +def test_reverse_output_iterator_scan(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_output = cp.empty_like(d_input) + + cuda.compute.inclusive_scan( + d_in=d_input, + d_out=ReverseIterator(d_output), + op=OpKind.PLUS, + init_value=np.array([0], dtype=np.int32), + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1])) + + +def test_permutation_iterator_reduce(): + d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64) + d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32) + d_output = cp.empty(1, dtype=np.int64) + + cuda.compute.reduce_into( + d_in=PermutationIterator(d_values, d_indices), + d_out=d_output, + num_items=d_indices.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int64), + ) + + assert d_output.get()[0] == 160 + + +def test_transform_iterator_reduce(): + d_output = 
cp.empty(1, dtype=np.int32) + iterator = TransformIterator( + CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32 + ) + + cuda.compute.reduce_into( + d_in=iterator, + d_out=d_output, + num_items=6, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int32), + ) + + assert d_output.get()[0] == 91 + + +def test_transform_output_iterator_reduce(): + d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16) + d_output = cp.empty(1, dtype=np.int16) + output_iterator = TransformOutputIterator( + d_output, _raw_negate_i16_op(), output_value_type=cccl_int16 + ) + + cuda.compute.reduce_into( + d_in=d_input, + d_out=output_iterator, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int16), + ) + + assert d_output.get()[0] == -10 + + +def test_zip_iterator_transform(): + d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.unary_transform( + d_in=ZipIterator(d_lhs, d_rhs), + d_out=d_output, + op=_raw_zip_sum_i32_op(), + num_items=d_lhs.size, + ) + + np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get()) + + +def test_shuffle_iterator_transform(): + num_items = 17 + d_output = cp.empty(num_items, dtype=np.int64) + + cuda.compute.unary_transform( + d_in=ShuffleIterator(num_items, seed=123), + d_out=d_output, + op=OpKind.IDENTITY, + num_items=num_items, + ) + + result = d_output.get() + assert sorted(result.tolist()) == list(range(num_items)) + + +def test_discard_iterator_transform(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_reference = cp.full_like(d_input, -1) + + cuda.compute.unary_transform( + d_in=d_input, + d_out=DiscardIterator(d_reference), + op=OpKind.IDENTITY, + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32)) diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py 
b/python/cuda_cccl/tests/compute/test_permutation_iterator.py index 226904b75da..74d74a7b4e6 100644 --- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py +++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp import numpy as np -import pytest import cuda.compute from cuda.compute.iterators import ( @@ -13,7 +12,6 @@ ) -@pytest.mark.no_numba def test_permutation_iterator_equality(): values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") values2 = cp.asarray([100, 200, 300], dtype="int32") @@ -39,7 +37,6 @@ def test_permutation_iterator_equality(): assert it1.kind != it5.kind -@pytest.mark.no_numba def test_permutation_iterator_with_array_values(): values = cp.asarray([10, 20, 30, 40, 50], dtype="int32") indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -57,7 +54,6 @@ def test_permutation_iterator_with_array_values(): assert d_output[0] == values[indices].sum() -@pytest.mark.no_numba def test_permutation_iterator_with_iterator_values(): values_it = CountingIterator(np.int32(10)) indices = cp.asarray([2, 0, 4, 1], dtype="int32") @@ -161,7 +157,6 @@ def op(a): assert cp.all(d_out == expected) -@pytest.mark.no_numba def test_caching_permutation_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -209,7 +204,6 @@ def test_caching_permutation_iterator(): ) -@pytest.mark.no_numba def test_permutation_iterator_advance(): """Test PermutationIterator.__add__ only advances indices, not values.""" # Create values array [10, 20, 30, 40, 50, 60, 70] diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index d39a7e07fad..62b8e7dcb44 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -6,6 +6,7 @@ from typing import Tuple 
import cupy as cp +import numba import numpy as np import pytest @@ -14,7 +15,6 @@ DoubleBuffer, SortOrder, ) -from cuda.core import Device def get_mark(dt, log_size): @@ -148,7 +148,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu DTYPE_SIZE, ) def test_radix_sort_keys(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -164,12 +164,12 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -195,17 +195,17 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) radix_sort_device( d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items ) - h_out_keys = d_out_keys.get() - h_out_values = d_out_values.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_values = d_out_values.copy_to_host() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -218,7 +218,7 @@ def 
test_radix_sort_pairs(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -234,14 +234,14 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) radix_sort_device(keys_double_buffer, None, None, None, order, num_items) - h_out_keys = keys_double_buffer.current().get() + h_out_keys = keys_double_buffer.current().copy_to_host() h_in_keys, _ = host_sort(h_in_keys, None, order) @@ -253,7 +253,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -270,10 +270,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = 
numba.cuda.to_device(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -282,8 +282,8 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): keys_double_buffer, None, values_double_buffer, None, order, num_items ) - h_out_keys = keys_double_buffer.current().get() - h_out_values = values_double_buffer.current().get() + h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_values = values_double_buffer.current().copy_to_host() h_in_keys, h_in_values = host_sort(h_in_keys, h_in_values, order) @@ -304,7 +304,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE_BIT_WINDOW, ) def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -329,10 +329,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) radix_sort_device( d_in_keys, @@ -345,8 +345,8 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): end_bit, ) - h_out_keys = d_out_keys.get() - h_out_values = d_out_values.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_values = d_out_values.copy_to_host() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -384,10 +384,10 @@ def 
test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) - d_out_keys = cp.asarray(h_out_keys) - d_out_values = cp.asarray(h_out_values) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_values = numba.cuda.to_device(h_in_values) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_values = numba.cuda.to_device(h_out_values) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -403,8 +403,8 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch end_bit, ) - h_out_keys = keys_double_buffer.current().get() - h_out_values = values_double_buffer.current().get() + h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_values = values_double_buffer.current().copy_to_host() h_in_keys, h_in_values = host_sort( h_in_keys, h_in_values, order, begin_bit, end_bit @@ -469,9 +469,8 @@ def test_radix_sort_with_stream(cuda_stream): np.testing.assert_array_equal(got, h_in_keys) -@pytest.mark.no_numba def test_radix_sort(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -519,9 +518,8 @@ def test_radix_sort(monkeypatch): np.testing.assert_array_equal(h_out_items, h_in_values) -@pytest.mark.no_numba def test_radix_sort_double_buffer(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. 
# TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py index 15f11428506..df3f024f0ee 100644 --- a/python/cuda_cccl/tests/compute/test_reduce.py +++ b/python/cuda_cccl/tests/compute/test_reduce.py @@ -6,6 +6,7 @@ import random import cupy as cp +import numba.cuda import numpy as np import pytest from cupy.cuda import runtime @@ -72,15 +73,15 @@ def add_op(a, b): def test_device_reduce(dtype, num_items, op): init_value = 42 h_init = np.array([init_value], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = numba.cuda.device_array(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == pytest.approx( sum(h_input) + init_value, rel=0.08 if dtype == np.float16 else 0 ) # obtained relative error value from c2h/include/c2h/check_results.cuh @@ -93,10 +94,10 @@ def test_device_reduce_with_lambda(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = numba.cuda.device_array(1, dtype=dtype) h_input = random_int(num_items, dtype) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) # Use a lambda function directly as the reducer cuda.compute.reduce_into( @@ -106,7 +107,7 @@ def test_device_reduce_with_lambda(): op=lambda a, b: a + b, h_init=h_init, ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == sum(h_input) + init_value @@ -117,10 +118,10 @@ def test_device_reduce_with_lambda_variable(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = numba.cuda.device_array(1, dtype=dtype) h_input = 
random_int(num_items, dtype) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) # Use a lambda function assigned to a variable as the reducer cuda.compute.reduce_into( @@ -130,24 +131,24 @@ def test_device_reduce_with_lambda_variable(): op=add_op_lambda, h_init=h_init, ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == sum(h_input) + init_value def test_complex_device_reduce(): h_init = np.array([40.0 + 2.0j], dtype=complex) - d_output = cp.empty(1, dtype=complex) + d_output = numba.cuda.device_array(1, dtype=complex) for num_items in [42, 420000]: real_imag = np.random.random((2, num_items)) h_input = real_imag[0] + 1j * real_imag[1] - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) assert d_input.size == num_items cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) @@ -161,11 +162,11 @@ def _test_device_sum_with_iterator( if use_numpy_array: h_input = np.array(l_varr, dtype_inp) - d_input = cp.asarray(h_input) + d_input = numba.cuda.to_device(h_input) else: d_input = i_input - d_output = cp.empty(1, dtype_out) # to store device sum + d_output = numba.cuda.device_array(1, dtype_out) # to store device sum h_init = np.array([start_sum_with], dtype_out) @@ -173,7 +174,7 @@ def _test_device_sum_with_iterator( d_in=d_input, d_out=d_output, num_items=len(l_varr), op=add_op, h_init=h_init ) - h_output = d_output.get() + h_output = d_output.copy_to_host() assert h_output[0] == expected_result @@ -215,7 +216,7 @@ def test_device_sum_cache_modified_input_it( l_varr = [rng.randrange(100) for _ in range(num_items)] dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = cp.asarray(np.array(l_varr, dtype=dtype_inp)) + input_devarr = 
numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) i_input = CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -690,7 +691,6 @@ def add_op(x, y): ) -@pytest.mark.no_numba def test_device_reduce_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -709,7 +709,6 @@ def test_device_reduce_well_known_plus(): assert (d_output == expected_output).all() -@pytest.mark.no_numba def test_device_reduce_well_known_minimum(): dtype = np.int32 h_init = np.array([100], dtype=dtype) @@ -728,7 +727,6 @@ def test_device_reduce_well_known_minimum(): assert (d_output == expected_output).all() -@pytest.mark.no_numba def test_device_reduce_well_known_maximum(): dtype = np.int32 h_init = np.array([-100], dtype=dtype) @@ -927,7 +925,6 @@ def sqrt(x: dtype) -> dtype: np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) -@pytest.mark.no_numba def test_reduce_with_not_guaranteed_determinism(floating_array): dtype = floating_array.dtype h_init = np.array([0], dtype=dtype) @@ -945,7 +942,6 @@ def test_reduce_with_not_guaranteed_determinism(floating_array): ) -@pytest.mark.no_numba def test_reduce_bool(): h_init = np.array([False]) d_input = cp.array([True, False, True]) diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index 0eb3af34c49..af6d941b11b 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -4,6 +4,7 @@ import cupy as cp +import numba.cuda import numpy as np import pytest @@ -15,7 +16,6 @@ TransformOutputIterator, gpu_struct, ) -from cuda.core import Device def scan_host(h_input: np.ndarray, op, h_init, force_inclusive): @@ -52,7 +52,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea [True, False], ) def test_scan_array_input(force_inclusive, input_array, monkeypatch): - cc_major, _ 
= Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification if input is complex # as LDL/STL instructions are emitted for complex types. # Also skip for: @@ -214,7 +214,6 @@ def op(a, b): np.testing.assert_allclose(expected, got, rtol=1e-5) -@pytest.mark.no_numba def test_exclusive_scan_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) @@ -233,9 +232,8 @@ def test_exclusive_scan_well_known_plus(): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_inclusive_scan_well_known_plus(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -356,7 +354,7 @@ def add_op(a, b): def test_reverse_input_iterator(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -411,7 +409,6 @@ def add_op(a, b): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba @pytest.mark.parametrize( "force_inclusive", [True, False], @@ -433,14 +430,13 @@ def test_future_init_value(force_inclusive): np.testing.assert_array_equal(expected, got) -@pytest.mark.no_numba def test_no_init_value(monkeypatch): force_inclusive = True num_items = 1024 dtype = np.dtype("int32") # Skip SASS check for CC 9.0 due to LDL/STL CI failure. 
- cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability if cc_major >= 9: import cuda.compute._cccl_interop @@ -460,7 +456,6 @@ def test_no_init_value(monkeypatch): np.testing.assert_array_equal(expected, got) -@pytest.mark.no_numba def test_no_init_value_iterator(): force_inclusive = True num_items = 1024 @@ -498,7 +493,6 @@ def test_inclusive_scan_with_lambda(): np.testing.assert_array_equal(d_output.get(), expected) -@pytest.mark.no_numba @pytest.mark.parametrize("force_inclusive", [True, False]) def test_scan_bool_maximum(force_inclusive): h_init = np.array([False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py index 815c98b8edb..950d860a14d 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py +++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py @@ -269,7 +269,6 @@ def _plus(a, b): ) -@pytest.mark.no_numba def test_segmented_reduce_well_known_plus(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -300,7 +299,6 @@ def test_segmented_reduce_well_known_plus(monkeypatch): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_segmented_reduce_well_known_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). monkeypatch.setattr( @@ -331,7 +329,6 @@ def test_segmented_reduce_well_known_maximum(monkeypatch): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_segmented_reduce_bool_maximum(monkeypatch): # Disable SASS verification for this test (LDL instruction in SASS). 
monkeypatch.setattr( diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py index 9c2ab9f2de4..2bdd03abb42 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_sort.py +++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py @@ -5,13 +5,12 @@ from typing import Tuple import cupy as cp +import numba import numpy as np import pytest import cuda.compute -pytestmark = pytest.mark.no_numba - DTYPE_LIST = [ np.uint8, np.int16, @@ -119,8 +118,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=50) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(np.empty_like(h_in_keys)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -134,7 +133,7 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): order=order, ) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -154,10 +153,10 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_in_vals = cp.asarray(h_in_vals) - d_out_keys = cp.asarray(np.empty_like(h_in_keys)) - d_out_vals = cp.asarray(np.empty_like(h_in_vals)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_vals = numba.cuda.to_device(h_in_vals) + d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -171,8 +170,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): order=order, ) - h_out_keys = 
d_out_keys.get() - h_out_vals = d_out_vals.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_vals = d_out_vals.copy_to_host() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -190,8 +189,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): h_in_keys = random_array(num_items, dtype, max_value=20) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) cuda.compute.segmented_sort( @@ -206,7 +205,7 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().get() + h_out_keys = keys_db.current().copy_to_host() expected_keys, _ = host_segmented_sort( h_in_keys, None, start_offsets, end_offsets, order ) @@ -225,10 +224,10 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = cp.asarray(h_in_keys) - d_in_vals = cp.asarray(h_in_vals) - d_tmp_keys = cp.asarray(np.empty_like(h_in_keys)) - d_tmp_vals = cp.asarray(np.empty_like(h_in_vals)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_vals = numba.cuda.to_device(h_in_vals) + d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals) @@ -245,8 +244,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): order=order, ) - h_out_keys = keys_db.current().get() - h_out_vals = vals_db.current().get() + h_out_keys = keys_db.current().copy_to_host() + h_out_vals = 
vals_db.current().copy_to_host() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order @@ -298,10 +297,10 @@ def test_segmented_sort_variable_segment_sizes(num_segments): h_in_keys = random_array(num_items, np.int32, max_value=100) h_in_vals = random_array(num_items, np.float32) - d_in_keys = cp.asarray(h_in_keys) - d_in_vals = cp.asarray(h_in_vals) - d_out_keys = cp.asarray(np.empty_like(h_in_keys)) - d_out_vals = cp.asarray(np.empty_like(h_in_vals)) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_vals = numba.cuda.to_device(h_in_vals) + d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -315,8 +314,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments): order=order, ) - h_out_keys = d_out_keys.get() - h_out_vals = d_out_vals.get() + h_out_keys = d_out_keys.copy_to_host() + h_out_vals = d_out_vals.copy_to_host() expected_keys, expected_vals = host_segmented_sort( h_in_keys, h_in_vals, start_offsets, end_offsets, order ) diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index 3dcaf98a507..f2483860a4a 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -8,8 +8,6 @@ import cuda.compute from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct -from cuda.compute._cpp_compile import compile_cpp_op_code -from cuda.compute.op import RawOp DTYPE_LIST = [ np.uint8, @@ -61,35 +59,6 @@ def _host_select(h_in: np.ndarray, cond): return selected, np.int64(selected.size) -def _raw_even_i32_op() -> RawOp: - source = """ -extern "C" __device__ void is_even_i32(void* x, void* result) { - int value = *static_cast(x); - *static_cast(result) = (value % 2) == 0; -} -""" - return RawOp(ltoir=compile_cpp_op_code(source), name="is_even_i32") - - 
-@pytest.mark.no_numba -def test_select_raw_op_minimal(): - h_in = np.arange(10, dtype=np.int32) - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) - - cuda.compute.select( - d_in=d_in, - d_out=d_out, - d_num_selected_out=d_num_selected, - cond=_raw_even_i32_op(), - num_items=len(d_in), - ) - - num_selected = int(d_num_selected[0].get()) - np.testing.assert_array_equal(d_out.get()[:num_selected], h_in[h_in % 2 == 0]) - - @pytest.mark.parametrize("dtype,num_items", select_params) def test_select_basic(dtype, num_items): h_in = random_array(num_items, dtype, max_value=100) diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py index 715f9e51235..5c82fe1ad38 100644 --- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py +++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py @@ -87,7 +87,6 @@ def test_shuffle_iterator_with_permutation_iterator(): assert sorted(result) == sorted(d_values.get()) -@pytest.mark.no_numba def test_shuffle_iterator_invalid_num_items(): with pytest.raises(ValueError, match="num_items must be > 0"): ShuffleIterator(0, seed=42) diff --git a/python/cuda_cccl/tests/compute/test_struct_field_validation.py b/python/cuda_cccl/tests/compute/test_struct_field_validation.py index cf1edd7e681..3ddaee54d41 100644 --- a/python/cuda_cccl/tests/compute/test_struct_field_validation.py +++ b/python/cuda_cccl/tests/compute/test_struct_field_validation.py @@ -14,8 +14,6 @@ from cuda.compute import gpu_struct -pytestmark = pytest.mark.no_numba - def test_newline_in_field_name_is_rejected(): """Field names with newlines must be rejected — they are the exec() injection vector.""" diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 8def513681b..3fb5b102275 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ 
b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -8,8 +8,6 @@ import cuda.compute from cuda.compute import CacheModifiedInputIterator, gpu_struct -from cuda.compute._cpp_compile import compile_cpp_op_code -from cuda.compute.op import RawOp DTYPE_LIST = [ np.uint8, @@ -67,50 +65,6 @@ def _host_three_way_partition(h_in: np.ndarray, less_than_op, greater_equal_op): ) -def _raw_less_than_i32(name: str, threshold: int) -> RawOp: - source = f""" -extern "C" __device__ void {name}(void* x, void* result) {{ - int value = *static_cast(x); - *static_cast(result) = value < {threshold} ? 1 : 0; -}} -""" - return RawOp(ltoir=compile_cpp_op_code(source), name=name) - - -@pytest.mark.no_numba -def test_three_way_partition_raw_op_minimal(): - h_in = np.arange(10, dtype=np.int32) - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) - - cuda.compute.three_way_partition( - d_in=d_in, - d_first_part_out=d_first, - d_second_part_out=d_second, - d_unselected_out=d_unselected, - d_num_selected_out=d_num_selected, - select_first_part_op=_raw_less_than_i32("less_than_3_i32", 3), - select_second_part_op=_raw_less_than_i32("less_than_6_i32", 6), - num_items=len(d_in), - ) - - selected = d_num_selected.get() - first_count = int(selected[0]) - second_count = int(selected[1]) - unselected_count = len(h_in) - first_count - second_count - - np.testing.assert_array_equal(d_first.get()[:first_count], h_in[h_in < 3]) - np.testing.assert_array_equal( - d_second.get()[:second_count], h_in[(h_in >= 3) & (h_in < 6)] - ) - np.testing.assert_array_equal( - d_unselected.get()[:unselected_count], h_in[h_in >= 6] - ) - - @pytest.mark.parametrize("dtype,num_items", three_way_partition_params) def test_three_way_partition_basic(dtype, num_items, monkeypatch): # NOTE: the SASS check failure is seen only with NVRTC 13.1: diff --git 
a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py index b8429726d77..c7c3ca2818a 100644 --- a/python/cuda_cccl/tests/compute/test_transform.py +++ b/python/cuda_cccl/tests/compute/test_transform.py @@ -266,7 +266,6 @@ def op2(a): np.testing.assert_allclose(expected, got) -@pytest.mark.no_numba def test_unary_transform_well_known_negate(): """Test unary transform with well-known NEGATE operation.""" dtype = np.int32 @@ -283,7 +282,6 @@ def test_unary_transform_well_known_negate(): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_unary_transform_well_known_identity(): """Test unary transform with well-known IDENTITY operation.""" dtype = np.int32 @@ -300,7 +298,6 @@ def test_unary_transform_well_known_identity(): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba @pytest.mark.parametrize("dtype", [np.int32, np.float16]) def test_binary_transform_well_known_plus(dtype): """Test binary transform with well-known PLUS operation.""" @@ -322,7 +319,6 @@ def test_binary_transform_well_known_plus(dtype): np.testing.assert_equal(d_output.get(), expected) -@pytest.mark.no_numba def test_binary_transform_well_known_multiplies(): """Test binary transform with well-known MULTIPLIES operation.""" dtype = np.int32 @@ -590,7 +586,6 @@ def test_binary_transform_with_lambda(): np.testing.assert_array_equal(d_out.get(), expected) -@pytest.mark.no_numba def test_binary_transform_bool_equal_to(): d_input1 = cp.array([True, False, True, False], dtype=np.bool_) d_input2 = cp.array([True, True, False, False], dtype=np.bool_) diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 945d4d99028..83a4a17db67 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -4,6 +4,7 @@ import cupy as cp +import numba.cuda import numpy as np import pytest @@ 
-14,7 +15,6 @@ OpKind, gpu_struct, ) -from cuda.core import Device DTYPE_LIST = [ np.uint8, @@ -125,7 +125,7 @@ def compare_op(lhs, rhs): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key(dtype, num_items, op, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -143,11 +143,11 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -159,10 +159,10 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -172,7 +172,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = 
numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -190,11 +190,11 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int64) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -209,10 +209,10 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host(h_in_keys, h_in_items) @@ -226,9 +226,9 @@ def test_unique_by_key_keys_only(): h_out_keys = np.empty(num_items, dtype=np.int32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.asarray(h_out_keys) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -240,9 +240,9 @@ def 
test_unique_by_key_keys_only(): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] expected_keys, _ = unique_by_key_host( h_in_keys, @@ -267,11 +267,11 @@ def compare_complex(lhs, rhs): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.asarray(h_out_keys) - d_out_items = cp.asarray(h_out_items) - d_out_num_selected = cp.asarray(h_out_num_selected) + d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_items = numba.cuda.to_device(h_in_items) + d_out_keys = numba.cuda.to_device(h_out_keys) + d_out_items = numba.cuda.to_device(h_out_items) + d_out_num_selected = numba.cuda.to_device(h_out_num_selected) unique_by_key_device( d_in_keys, @@ -283,10 +283,10 @@ def compare_complex(lhs, rhs): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host( h_in_keys, h_in_items, compare_complex @@ -363,7 +363,7 @@ def struct_compare_op(lhs, rhs): def test_unique_by_key_with_stream(cuda_stream, monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. 
# TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -417,9 +417,8 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): np.testing.assert_array_equal(h_out_items, expected_items) -@pytest.mark.no_numba def test_unique_by_key_well_known_equal_to(monkeypatch): - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: diff --git a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py index 2788a3c0893..e08709b6e1a 100644 --- a/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py +++ b/python/cuda_cccl/tests/compute/test_void_ptr_wrapper_validation.py @@ -12,16 +12,10 @@ """ import pytest +from numba import types -numba = pytest.importorskip("numba") -types = numba.types - -from cuda.compute._odr_helpers import ( # noqa: E402 - _ArgMode, - _ArgSpec, - _create_void_ptr_wrapper, -) -from cuda.compute._utils import sanitize_identifier # noqa: E402 +from cuda.compute._odr_helpers import _ArgMode, _ArgSpec, _create_void_ptr_wrapper +from cuda.compute._utils import sanitize_identifier def _make_arg_specs(): diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index 9fbed38359f..85630bb4f5f 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -12,7 +12,6 @@ ZipIterator, gpu_struct, ) -from cuda.core import Device @pytest.mark.parametrize("num_items", [10, 1_000, 100_000]) @@ -262,8 +261,10 @@ def min_pairs(p1, p2): @pytest.mark.parametrize("num_items", [10, 1000]) def test_output_zip_iterator_with_scan(monkeypatch, num_items): """Test ZipIterator as output iterator with scan operations.""" + import numba.cuda + # Skip SASS check for CC 
8.0+ due to LDL/STL CI failure. - cc_major, _ = Device().compute_capability + cc_major, _ = numba.cuda.get_current_device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -425,7 +426,9 @@ def sum_nested_zips(v1, v2): ], ) def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map): - cc_major, _ = Device().compute_capability + import numba.cuda + + cc_major, _ = numba.cuda.get_current_device().compute_capability if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -499,7 +502,6 @@ def g(x): assert it1.kind != it2.kind -@pytest.mark.no_numba def test_caching_zip_iterator(): """Test that iterator compilation is cached across instances with the same structure.""" from cuda.compute._cpp_compile import compile_cpp_op_code @@ -575,7 +577,6 @@ def test_caching_zip_iterator(): assert len(set(kinds)) == 1, "Same CountingIterator types should have same kind" -@pytest.mark.no_numba def test_compilation_caching_across_iterator_types(): """Test that compilation caching works across different iterator types.""" from cuda.compute import ConstantIterator