diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 6146d863c87..1fcb1df75ae 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -84,6 +84,7 @@ workflows: - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']} - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'} + - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'} # CCCL packaging: - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'} - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'} @@ -539,6 +540,7 @@ jobs: test_py_headers: { name: "Test cuda.cccl.headers", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} } test_py_coop: { name: "Test cuda.coop._experimental", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} } test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} } + test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} } test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} } # Run jobs for 'target' project (ci/util/build_and_test_targets.sh): diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh new file mode 100755 index 00000000000..a43bbac7bfa --- /dev/null +++ b/ci/test_cuda_compute_minimal_python.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "$ci_dir/.." && pwd)" +source "$ci_dir/pyenv_helper.sh" + +# Parse common arguments +source "$ci_dir/util/python/common_arg_parser.sh" +parse_python_args "$@" +require_py_version "Usage: $0 -py-version " + +cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2) + +# Setup Python environment +setup_python_env "${py_version}" + +# Fetch or build the cuda_cccl wheel: +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh") + "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/" + wheelhouse_dir="${repo_root}/wheelhouse" +else + "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}" + wheelhouse_dir="${repo_root}/wheelhouse" +fi + +# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the +# full cu* extras because those pull in numba/numba-cuda. +CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)" +python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]" +python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x" + +cd "${repo_root}/python/cuda_cccl/tests/" +python -m pytest -n 6 -v compute/test_no_numba.py diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py index 8f1d271e1ed..6fb1ef0e811 100644 --- a/python/cuda_cccl/tests/compute/test_no_numba.py +++ b/python/cuda_cccl/tests/compute/test_no_numba.py @@ -1,16 +1,108 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + import cupy as cp import numpy as np import pytest import cuda.compute -from cuda.compute import OpKind +from cuda.compute import ( + CacheModifiedInputIterator, + ConstantIterator, + CountingIterator, + DiscardIterator, + OpKind, + PermutationIterator, + ReverseIterator, + ShuffleIterator, + SortOrder, + TransformIterator, + TransformOutputIterator, + ZipIterator, +) +from cuda.compute._cpp_compile import compile_cpp_op_code +from cuda.compute.op import RawOp +from cuda.compute.types import int16 as cccl_int16 +from cuda.compute.types import int32 as cccl_int32 -# Mainly, these tests check that we can use algorithms with OpKind -# operators while not requiring numba to be installed. +# These tests define the minimal-extra integration contract. They intentionally +# use small fixed inputs and avoid the Python-callable operator path. pytestmark = pytest.mark.no_numba -@pytest.mark.no_numba +def _raw_op(source: str, name: str) -> RawOp: + return RawOp(ltoir=compile_cpp_op_code(source), name=name) + + +def _raw_even_i32_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_even_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = (value % 2) == 0; +} +""" + return _raw_op(source, "no_numba_even_i32") + + +def _raw_less_than_i32_op(name: str, threshold: int) -> RawOp: + source = f""" +extern "C" __device__ void {name}(void* x, void* result) {{ + int value = *static_cast(x); + *static_cast(result) = value < {threshold} ? 1 : 0; +}} +""" + return _raw_op(source, name) + + +def _raw_plus_i64_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_plus_i64( + void* lhs, + void* rhs, + void* result +) { + *static_cast(result) = + *static_cast(lhs) + *static_cast(rhs); +} +""" + return _raw_op(source, "no_numba_plus_i64") + + +def _raw_square_i32_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_square_i32(void* x, void* result) { + int value = *static_cast(x); + *static_cast(result) = value * value; +} +""" + return _raw_op(source, "no_numba_square_i32") + + +def _raw_zip_sum_i32_op() -> RawOp: + source = """ +struct Zip2I32 { + int field_0; + int field_1; +}; + +extern "C" __device__ void no_numba_zip_sum_i32(void* x, void* result) { + auto values = static_cast(x); + *static_cast(result) = values->field_0 + values->field_1; +} +""" + return _raw_op(source, "no_numba_zip_sum_i32") + + +def _raw_negate_i16_op() -> RawOp: + source = """ +extern "C" __device__ void no_numba_negate_i16(void* x, void* result) { + *static_cast(result) = -*static_cast(x); +} +""" + return _raw_op(source, "no_numba_negate_i16") + + def test_import_numba_raises(): with pytest.raises( ImportError, match="This test is marked 'no_numba' but attempted to import it" @@ -18,68 +110,459 @@ def test_import_numba_raises(): import numba.cuda # noqa: F401 -def test_reduce_op_kind(): - num_items = 100 - h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) +def test_reduce_well_known_plus(): + h_input = np.arange(1, 14, dtype=np.int32) + d_input = cp.asarray(h_input) d_output = cp.empty(1, dtype=np.int32) + h_init = np.array([5], dtype=np.int32) - h_init = np.array(0, dtype=np.int32) cuda.compute.reduce_into( - d_in=d_input, d_out=d_output, num_items=num_items, op=OpKind.PLUS, h_init=h_init + d_in=d_input, + d_out=d_output, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=h_init, + ) + + assert d_output.get()[0] == np.sum(h_input, initial=h_init[0]) + + +def test_exclusive_scan_well_known_plus(): + d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16) + d_output = cp.empty_like(d_input) + h_init = np.array([1], dtype=np.uint16) + + cuda.compute.exclusive_scan( + d_in=d_input, + d_out=d_output, + op=OpKind.PLUS, + init_value=h_init, + num_items=d_input.size, ) - result = d_output.get()[0] - expected = np.sum(h_input) - assert result == expected + expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16) + np.testing.assert_array_equal(d_output.get(), expected) -def test_binary_transform_op_kind(): - num_items = 100 - h_input1 = np.arange(num_items, dtype=np.int32) - h_input2 = np.arange(num_items, dtype=np.int32) * 2 - d_input1 = cp.array(h_input1) - d_input2 = cp.array(h_input2) - d_output = cp.empty(num_items, dtype=np.int32) +def test_binary_transform_well_known_plus(): + d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32) + d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32) + d_output = cp.empty_like(d_lhs) cuda.compute.binary_transform( - d_in1=d_input1, - d_in2=d_input2, + d_in1=d_lhs, + d_in2=d_rhs, d_out=d_output, op=OpKind.PLUS, - num_items=num_items, + num_items=d_lhs.size, ) - result = d_output.get() - expected = h_input1 + h_input2 - assert np.array_equal(result, expected) + np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get()) + + +def test_unary_transform_well_known_negate(): + d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8) + d_output = cp.empty_like(d_input) + + cuda.compute.unary_transform( + d_in=d_input, + d_out=d_output, + op=OpKind.NEGATE, + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4])) + + +@pytest.mark.parametrize( + "search, side", + [ + (cuda.compute.lower_bound, "left"), + (cuda.compute.upper_bound, "right"), + ], +) +def test_binary_search_explicit_opkind_less(search, side): + h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64) + h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64) + d_out = cp.empty(h_values.size, dtype=np.uintp) + + search( + d_data=cp.asarray(h_data), + num_items=h_data.size, + d_values=cp.asarray(h_values), + num_values=h_values.size, + d_out=d_out, + comp=OpKind.LESS, + ) + + expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) + np.testing.assert_array_equal(d_out.get(), expected) + + +def test_segmented_reduce_well_known_plus(monkeypatch): + monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) + + d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32) + d_starts = cp.asarray([0, 3, 5], dtype=np.int32) + d_ends = cp.asarray([3, 5, 8], dtype=np.int32) + d_output = cp.empty(3, dtype=np.uint32) + h_init = np.array([0], dtype=np.uint32) + + cuda.compute.segmented_reduce( + d_in=d_input, + d_out=d_output, + num_segments=3, + start_offsets_in=d_starts, + end_offsets_in=d_ends, + op=OpKind.PLUS, + h_init=h_init, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21])) + + +def test_merge_sort_well_known_less(): + d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64) + d_output = cp.empty_like(d_input) + + cuda.compute.merge_sort( + d_in_keys=d_input, + d_in_values=None, + d_out_keys=d_output, + d_out_values=None, + num_items=d_input.size, + op=OpKind.LESS, + ) + + np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get())) + + +def test_radix_sort_key_value_pairs(): + h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16) + h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8) + d_out_keys = cp.empty_like(cp.asarray(h_keys)) + d_out_values = cp.empty_like(cp.asarray(h_values)) + cuda.compute.radix_sort( + d_in_keys=cp.asarray(h_keys), + d_out_keys=d_out_keys, + d_in_values=cp.asarray(h_values), + d_out_values=d_out_values, + num_items=h_keys.size, + order=SortOrder.ASCENDING, + ) -def test_segmented_sort_op_kind(): - # Create segments: [3, 1, 4] | [1, 5, 9, 2] | [6, 5] - num_items = 9 - h_keys = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5], dtype=np.int32) - h_offsets = np.array([0, 3, 7, 9], dtype=np.int32) + order = np.argsort(h_keys, stable=True) + np.testing.assert_array_equal(d_out_keys.get(), h_keys[order]) + np.testing.assert_array_equal(d_out_values.get(), h_values[order]) - d_keys_in = cp.array(h_keys) - d_keys_out = cp.empty(num_items, dtype=np.int32) - d_offsets = cp.array(h_offsets) - num_segments = len(h_offsets) - 1 +def test_segmented_sort_keys(): + h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64) + h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64) + d_output = cp.empty_like(cp.asarray(h_keys)) cuda.compute.segmented_sort( - d_in_keys=d_keys_in, - d_out_keys=d_keys_out, + d_in_keys=cp.asarray(h_keys), + d_out_keys=d_output, d_in_values=None, d_out_values=None, + num_items=h_keys.size, + num_segments=h_offsets.size - 1, + start_offsets_in=cp.asarray(h_offsets[:-1]), + end_offsets_in=cp.asarray(h_offsets[1:]), + order=SortOrder.ASCENDING, + ) + + expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64) + np.testing.assert_array_equal(d_output.get(), expected) + + +def test_unique_by_key_well_known_equal_to(monkeypatch): + cc_major, _ = cuda.compute._cccl_interop.CudaDevice().compute_capability + if cc_major >= 9: + monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) + + d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16) + d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8) + d_out_keys = cp.empty_like(d_keys) + d_out_values = cp.empty_like(d_values) + d_num_selected = cp.empty(1, dtype=np.int64) + + cuda.compute.unique_by_key( + d_in_keys=d_keys, + d_in_items=d_values, + d_out_keys=d_out_keys, + d_out_items=d_out_values, + d_out_num_selected=d_num_selected, + op=OpKind.EQUAL_TO, + num_items=d_keys.size, + ) + + num_selected = int(d_num_selected.get()[0]) + np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4]) + np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40]) + + +def test_histogram_even_small_range(): + h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32) + d_histogram = cp.empty(4, dtype=np.int32) + + cuda.compute.histogram_even( + d_samples=cp.asarray(h_samples), + d_histogram=d_histogram, + num_output_levels=5, + lower_level=np.float32(0.0), + upper_level=np.float32(4.0), + num_samples=h_samples.size, + ) + + expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0)) + np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32)) + + +def test_select_raw_op(): + h_input = np.arange(12, dtype=np.int32) + d_output = cp.empty_like(cp.asarray(h_input)) + d_num_selected = cp.empty(1, dtype=np.uint64) + + cuda.compute.select( + d_in=cp.asarray(h_input), + d_out=d_output, + d_num_selected_out=d_num_selected, + cond=_raw_even_i32_op(), + num_items=h_input.size, + ) + + num_selected = int(d_num_selected.get()[0]) + np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2]) + + +def test_three_way_partition_raw_op(): + h_input = np.arange(12, dtype=np.int32) + d_first = cp.empty_like(cp.asarray(h_input)) + d_second = cp.empty_like(cp.asarray(h_input)) + d_unselected = cp.empty_like(cp.asarray(h_input)) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.three_way_partition( + d_in=cp.asarray(h_input), + d_first_part_out=d_first, + d_second_part_out=d_second, + d_unselected_out=d_unselected, + d_num_selected_out=d_num_selected, + select_first_part_op=_raw_less_than_i32_op("no_numba_less_than_4_i32", 4), + select_second_part_op=_raw_less_than_i32_op("no_numba_less_than_8_i32", 8), + num_items=h_input.size, + ) + + selected = d_num_selected.get() + first_count = int(selected[0]) + second_count = int(selected[1]) + unselected_count = h_input.size - first_count - second_count + + np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4]) + np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8]) + np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:]) + + +def test_raw_op_reduce(): + h_input = np.asarray([10, 20, 30, 40], dtype=np.int64) + d_output = cp.empty(1, dtype=np.int64) + + cuda.compute.reduce_into( + d_in=cp.asarray(h_input), + d_out=d_output, + num_items=h_input.size, + op=_raw_plus_i64_op(), + h_init=np.array([5], dtype=np.int64), + ) + + assert d_output.get()[0] == 105 + + +def test_stream_argument(cuda_stream): + d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32) + d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.binary_transform( + d_in1=d_lhs, + d_in2=d_rhs, + d_out=d_output, + op=OpKind.PLUS, + num_items=d_lhs.size, + stream=cuda_stream, + ) + + cp.cuda.Device().synchronize() + np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19])) + + +def test_counting_iterator_reduce(): + d_output = cp.empty(1, dtype=np.int32) + + cuda.compute.reduce_into( + d_in=CountingIterator(np.int32(3)), + d_out=d_output, + num_items=8, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int32), + ) + + assert d_output.get()[0] == 52 + + +def test_constant_iterator_reduce(): + d_output = cp.empty(1, dtype=np.float32) + + cuda.compute.reduce_into( + d_in=ConstantIterator(np.float32(1.5)), + d_out=d_output, + num_items=8, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.float32), + ) + + np.testing.assert_allclose(d_output.get()[0], np.float32(12.0)) + + +def test_cache_modified_input_iterator_reduce(): + d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16) + d_output = cp.empty(1, dtype=np.uint16) + iterator = CacheModifiedInputIterator(d_input, modifier="stream") + + cuda.compute.reduce_into( + d_in=iterator, + d_out=d_output, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.uint16), + ) + + assert d_output.get()[0] == 30 + + +def test_reverse_input_iterator_scan(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_output = cp.empty_like(d_input) + + cuda.compute.inclusive_scan( + d_in=ReverseIterator(d_input), + d_out=d_output, + op=OpKind.PLUS, + init_value=np.array([0], dtype=np.int32), + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15])) + + +def test_reverse_output_iterator_scan(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_output = cp.empty_like(d_input) + + cuda.compute.inclusive_scan( + d_in=d_input, + d_out=ReverseIterator(d_output), + op=OpKind.PLUS, + init_value=np.array([0], dtype=np.int32), + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1])) + + +def test_permutation_iterator_reduce(): + d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64) + d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32) + d_output = cp.empty(1, dtype=np.int64) + + cuda.compute.reduce_into( + d_in=PermutationIterator(d_values, d_indices), + d_out=d_output, + num_items=d_indices.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int64), + ) + + assert d_output.get()[0] == 160 + + +def test_transform_iterator_reduce(): + d_output = cp.empty(1, dtype=np.int32) + iterator = TransformIterator( + CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32 + ) + + cuda.compute.reduce_into( + d_in=iterator, + d_out=d_output, + num_items=6, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int32), + ) + + assert d_output.get()[0] == 91 + + +def test_transform_output_iterator_reduce(): + d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16) + d_output = cp.empty(1, dtype=np.int16) + output_iterator = TransformOutputIterator( + d_output, _raw_negate_i16_op(), output_value_type=cccl_int16 + ) + + cuda.compute.reduce_into( + d_in=d_input, + d_out=output_iterator, + num_items=d_input.size, + op=OpKind.PLUS, + h_init=np.array([0], dtype=np.int16), + ) + + assert d_output.get()[0] == -10 + + +def test_zip_iterator_transform(): + d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32) + d_output = cp.empty_like(d_lhs) + + cuda.compute.unary_transform( + d_in=ZipIterator(d_lhs, d_rhs), + d_out=d_output, + op=_raw_zip_sum_i32_op(), + num_items=d_lhs.size, + ) + + np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get()) + + +def test_shuffle_iterator_transform(): + num_items = 17 + d_output = cp.empty(num_items, dtype=np.int64) + + cuda.compute.unary_transform( + d_in=ShuffleIterator(num_items, seed=123), + d_out=d_output, + op=OpKind.IDENTITY, num_items=num_items, - num_segments=num_segments, - start_offsets_in=d_offsets[:-1], - end_offsets_in=d_offsets[1:], - order=cuda.compute.SortOrder.ASCENDING, ) - result = d_keys_out.get() - # Expected: [1, 3, 4] | [1, 2, 5, 9] | [5, 6] - expected = np.array([1, 3, 4, 1, 2, 5, 9, 5, 6], dtype=np.int32) - assert np.array_equal(result, expected) + result = d_output.get() + assert sorted(result.tolist()) == list(range(num_items)) + + +def test_discard_iterator_transform(): + d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_reference = cp.full_like(d_input, -1) + + cuda.compute.unary_transform( + d_in=d_input, + d_out=DiscardIterator(d_reference), + op=OpKind.IDENTITY, + num_items=d_input.size, + ) + + np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32))