diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 6146d863c87..1fcb1df75ae 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -84,6 +84,7 @@ workflows:
     - {jobs: ['test'], project: 'python', ctk: ['12.X',        '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
     - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
     - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'}
+    - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'}
     # CCCL packaging:
     - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'}
     - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
@@ -539,6 +540,7 @@ jobs:
   test_py_headers:  { name: "Test cuda.cccl.headers",      gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
   test_py_coop:     { name: "Test cuda.coop._experimental",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
   test_py_par:      { name: "Test cuda.compute",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
+  test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} }
   test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
 
   # Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
new file mode 100755
index 00000000000..a43bbac7bfa
--- /dev/null
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(cd "$ci_dir/.." && pwd)"
+source "$ci_dir/pyenv_helper.sh"
+
+# Parse common arguments
+source "$ci_dir/util/python/common_arg_parser.sh"
+parse_python_args "$@"
+require_py_version "Usage: $0 -py-version <python_version>"
+
+cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)
+
+# Setup Python environment
+setup_python_env "${py_version}"
+
+# Fetch or build the cuda_cccl wheel:
+if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
+  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/"
+  wheelhouse_dir="${repo_root}/wheelhouse"
+else
+  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
+  wheelhouse_dir="${repo_root}/wheelhouse"
+fi
+
+# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
+# full cu* extras because those pull in numba/numba-cuda.
+CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
+python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
+python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
+
+cd "${repo_root}/python/cuda_cccl/tests/"
+python -m pytest -n 6 -v compute/test_no_numba.py
diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py
index 8f1d271e1ed..6fb1ef0e811 100644
--- a/python/cuda_cccl/tests/compute/test_no_numba.py
+++ b/python/cuda_cccl/tests/compute/test_no_numba.py
@@ -1,16 +1,108 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 import cupy as cp
 import numpy as np
 import pytest
 
 import cuda.compute
-from cuda.compute import OpKind
+from cuda.compute import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    DiscardIterator,
+    OpKind,
+    PermutationIterator,
+    ReverseIterator,
+    ShuffleIterator,
+    SortOrder,
+    TransformIterator,
+    TransformOutputIterator,
+    ZipIterator,
+)
+from cuda.compute._cpp_compile import compile_cpp_op_code
+from cuda.compute.op import RawOp
+from cuda.compute.types import int16 as cccl_int16
+from cuda.compute.types import int32 as cccl_int32
 
-# Mainly, these tests check that we can use algorithms with OpKind
-# operators while not requiring numba to be installed.
+# These tests define the minimal-extra integration contract. They intentionally
+# use small fixed inputs and avoid the Python-callable operator path.
 pytestmark = pytest.mark.no_numba
 
 
-@pytest.mark.no_numba
+def _raw_op(source: str, name: str) -> RawOp:
+    return RawOp(ltoir=compile_cpp_op_code(source), name=name)
+
+
+def _raw_even_i32_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_even_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<bool*>(result) = (value % 2) == 0;
+}
+"""
+    return _raw_op(source, "no_numba_even_i32")
+
+
+def _raw_less_than_i32_op(name: str, threshold: int) -> RawOp:
+    source = f"""
+extern "C" __device__ void {name}(void* x, void* result) {{
+    int value = *static_cast<int*>(x);
+    *static_cast<unsigned char*>(result) = value < {threshold} ? 1 : 0;
+}}
+"""
+    return _raw_op(source, name)
+
+
+def _raw_plus_i64_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_plus_i64(
+    void* lhs,
+    void* rhs,
+    void* result
+) {
+    *static_cast<long long*>(result) =
+        *static_cast<long long*>(lhs) + *static_cast<long long*>(rhs);
+}
+"""
+    return _raw_op(source, "no_numba_plus_i64")
+
+
+def _raw_square_i32_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_square_i32(void* x, void* result) {
+    int value = *static_cast<int*>(x);
+    *static_cast<int*>(result) = value * value;
+}
+"""
+    return _raw_op(source, "no_numba_square_i32")
+
+
+def _raw_zip_sum_i32_op() -> RawOp:
+    source = """
+struct Zip2I32 {
+    int field_0;
+    int field_1;
+};
+
+extern "C" __device__ void no_numba_zip_sum_i32(void* x, void* result) {
+    auto values = static_cast<Zip2I32*>(x);
+    *static_cast<int*>(result) = values->field_0 + values->field_1;
+}
+"""
+    return _raw_op(source, "no_numba_zip_sum_i32")
+
+
+def _raw_negate_i16_op() -> RawOp:
+    source = """
+extern "C" __device__ void no_numba_negate_i16(void* x, void* result) {
+    *static_cast<short*>(result) = -*static_cast<short*>(x);
+}
+"""
+    return _raw_op(source, "no_numba_negate_i16")
+
+
 def test_import_numba_raises():
     with pytest.raises(
         ImportError, match="This test is marked 'no_numba' but attempted to import it"
@@ -18,68 +110,459 @@ def test_import_numba_raises():
         import numba.cuda  # noqa: F401
 
 
-def test_reduce_op_kind():
-    num_items = 100
-    h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
+def test_reduce_well_known_plus():
+    h_input = np.arange(1, 14, dtype=np.int32)
+    d_input = cp.asarray(h_input)
     d_output = cp.empty(1, dtype=np.int32)
+    h_init = np.array([5], dtype=np.int32)
 
-    h_init = np.array(0, dtype=np.int32)
     cuda.compute.reduce_into(
-        d_in=d_input, d_out=d_output, num_items=num_items, op=OpKind.PLUS, h_init=h_init
+        d_in=d_input,
+        d_out=d_output,
+        num_items=d_input.size,
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    assert d_output.get()[0] == np.sum(h_input, initial=h_init[0])
+
+
+def test_exclusive_scan_well_known_plus():
+    d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16)
+    d_output = cp.empty_like(d_input)
+    h_init = np.array([1], dtype=np.uint16)
+
+    cuda.compute.exclusive_scan(
+        d_in=d_input,
+        d_out=d_output,
+        op=OpKind.PLUS,
+        init_value=h_init,
+        num_items=d_input.size,
     )
 
-    result = d_output.get()[0]
-    expected = np.sum(h_input)
-    assert result == expected
+    expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16)
+    np.testing.assert_array_equal(d_output.get(), expected)
 
 
-def test_binary_transform_op_kind():
-    num_items = 100
-    h_input1 = np.arange(num_items, dtype=np.int32)
-    h_input2 = np.arange(num_items, dtype=np.int32) * 2
-    d_input1 = cp.array(h_input1)
-    d_input2 = cp.array(h_input2)
-    d_output = cp.empty(num_items, dtype=np.int32)
+def test_binary_transform_well_known_plus():
+    d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32)
+    d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32)
+    d_output = cp.empty_like(d_lhs)
 
     cuda.compute.binary_transform(
-        d_in1=d_input1,
-        d_in2=d_input2,
+        d_in1=d_lhs,
+        d_in2=d_rhs,
         d_out=d_output,
         op=OpKind.PLUS,
-        num_items=num_items,
+        num_items=d_lhs.size,
     )
 
-    result = d_output.get()
-    expected = h_input1 + h_input2
-    assert np.array_equal(result, expected)
+    np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get())
+
+
+def test_unary_transform_well_known_negate():
+    d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.unary_transform(
+        d_in=d_input,
+        d_out=d_output,
+        op=OpKind.NEGATE,
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4]))
+
+
+@pytest.mark.parametrize(
+    "search, side",
+    [
+        (cuda.compute.lower_bound, "left"),
+        (cuda.compute.upper_bound, "right"),
+    ],
+)
+def test_binary_search_explicit_opkind_less(search, side):
+    h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64)
+    h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64)
+    d_out = cp.empty(h_values.size, dtype=np.uintp)
+
+    search(
+        d_data=cp.asarray(h_data),
+        num_items=h_data.size,
+        d_values=cp.asarray(h_values),
+        num_values=h_values.size,
+        d_out=d_out,
+        comp=OpKind.LESS,
+    )
+
+    expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp)
+    np.testing.assert_array_equal(d_out.get(), expected)
+
+
+def test_segmented_reduce_well_known_plus(monkeypatch):
+    monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
+
+    d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32)
+    d_starts = cp.asarray([0, 3, 5], dtype=np.int32)
+    d_ends = cp.asarray([3, 5, 8], dtype=np.int32)
+    d_output = cp.empty(3, dtype=np.uint32)
+    h_init = np.array([0], dtype=np.uint32)
+
+    cuda.compute.segmented_reduce(
+        d_in=d_input,
+        d_out=d_output,
+        num_segments=3,
+        start_offsets_in=d_starts,
+        end_offsets_in=d_ends,
+        op=OpKind.PLUS,
+        h_init=h_init,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21]))
+
+
+def test_merge_sort_well_known_less():
+    d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.merge_sort(
+        d_in_keys=d_input,
+        d_in_values=None,
+        d_out_keys=d_output,
+        d_out_values=None,
+        num_items=d_input.size,
+        op=OpKind.LESS,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get()))
+
+
+def test_radix_sort_key_value_pairs():
+    h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16)
+    h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8)
+    d_out_keys = cp.empty_like(cp.asarray(h_keys))
+    d_out_values = cp.empty_like(cp.asarray(h_values))
 
+    cuda.compute.radix_sort(
+        d_in_keys=cp.asarray(h_keys),
+        d_out_keys=d_out_keys,
+        d_in_values=cp.asarray(h_values),
+        d_out_values=d_out_values,
+        num_items=h_keys.size,
+        order=SortOrder.ASCENDING,
+    )
 
-def test_segmented_sort_op_kind():
-    # Create segments: [3, 1, 4] | [1, 5, 9, 2] | [6, 5]
-    num_items = 9
-    h_keys = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5], dtype=np.int32)
-    h_offsets = np.array([0, 3, 7, 9], dtype=np.int32)
+    order = np.argsort(h_keys, stable=True)
+    np.testing.assert_array_equal(d_out_keys.get(), h_keys[order])
+    np.testing.assert_array_equal(d_out_values.get(), h_values[order])
 
-    d_keys_in = cp.array(h_keys)
-    d_keys_out = cp.empty(num_items, dtype=np.int32)
-    d_offsets = cp.array(h_offsets)
 
-    num_segments = len(h_offsets) - 1
+def test_segmented_sort_keys():
+    h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64)
+    h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64)
+    d_output = cp.empty_like(cp.asarray(h_keys))
 
     cuda.compute.segmented_sort(
-        d_in_keys=d_keys_in,
-        d_out_keys=d_keys_out,
+        d_in_keys=cp.asarray(h_keys),
+        d_out_keys=d_output,
         d_in_values=None,
         d_out_values=None,
+        num_items=h_keys.size,
+        num_segments=h_offsets.size - 1,
+        start_offsets_in=cp.asarray(h_offsets[:-1]),
+        end_offsets_in=cp.asarray(h_offsets[1:]),
+        order=SortOrder.ASCENDING,
+    )
+
+    expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64)
+    np.testing.assert_array_equal(d_output.get(), expected)
+
+
+def test_unique_by_key_well_known_equal_to(monkeypatch):
+    cc_major, _ = cuda.compute._cccl_interop.CudaDevice().compute_capability
+    if cc_major >= 9:
+        monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
+
+    d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16)
+    d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8)
+    d_out_keys = cp.empty_like(d_keys)
+    d_out_values = cp.empty_like(d_values)
+    d_num_selected = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.unique_by_key(
+        d_in_keys=d_keys,
+        d_in_items=d_values,
+        d_out_keys=d_out_keys,
+        d_out_items=d_out_values,
+        d_out_num_selected=d_num_selected,
+        op=OpKind.EQUAL_TO,
+        num_items=d_keys.size,
+    )
+
+    num_selected = int(d_num_selected.get()[0])
+    np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4])
+    np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40])
+
+
+def test_histogram_even_small_range():
+    h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32)
+    d_histogram = cp.empty(4, dtype=np.int32)
+
+    cuda.compute.histogram_even(
+        d_samples=cp.asarray(h_samples),
+        d_histogram=d_histogram,
+        num_output_levels=5,
+        lower_level=np.float32(0.0),
+        upper_level=np.float32(4.0),
+        num_samples=h_samples.size,
+    )
+
+    expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0))
+    np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32))
+
+
+def test_select_raw_op():
+    h_input = np.arange(12, dtype=np.int32)
+    d_output = cp.empty_like(cp.asarray(h_input))
+    d_num_selected = cp.empty(1, dtype=np.uint64)
+
+    cuda.compute.select(
+        d_in=cp.asarray(h_input),
+        d_out=d_output,
+        d_num_selected_out=d_num_selected,
+        cond=_raw_even_i32_op(),
+        num_items=h_input.size,
+    )
+
+    num_selected = int(d_num_selected.get()[0])
+    np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2])
+
+
+def test_three_way_partition_raw_op():
+    h_input = np.arange(12, dtype=np.int32)
+    d_first = cp.empty_like(cp.asarray(h_input))
+    d_second = cp.empty_like(cp.asarray(h_input))
+    d_unselected = cp.empty_like(cp.asarray(h_input))
+    d_num_selected = cp.empty(2, dtype=np.uint64)
+
+    cuda.compute.three_way_partition(
+        d_in=cp.asarray(h_input),
+        d_first_part_out=d_first,
+        d_second_part_out=d_second,
+        d_unselected_out=d_unselected,
+        d_num_selected_out=d_num_selected,
+        select_first_part_op=_raw_less_than_i32_op("no_numba_less_than_4_i32", 4),
+        select_second_part_op=_raw_less_than_i32_op("no_numba_less_than_8_i32", 8),
+        num_items=h_input.size,
+    )
+
+    selected = d_num_selected.get()
+    first_count = int(selected[0])
+    second_count = int(selected[1])
+    unselected_count = h_input.size - first_count - second_count
+
+    np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4])
+    np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8])
+    np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:])
+
+
+def test_raw_op_reduce():
+    h_input = np.asarray([10, 20, 30, 40], dtype=np.int64)
+    d_output = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.reduce_into(
+        d_in=cp.asarray(h_input),
+        d_out=d_output,
+        num_items=h_input.size,
+        op=_raw_plus_i64_op(),
+        h_init=np.array([5], dtype=np.int64),
+    )
+
+    assert d_output.get()[0] == 105
+
+
+def test_stream_argument(cuda_stream):
+    d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32)
+    d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32)
+    d_output = cp.empty_like(d_lhs)
+
+    cuda.compute.binary_transform(
+        d_in1=d_lhs,
+        d_in2=d_rhs,
+        d_out=d_output,
+        op=OpKind.PLUS,
+        num_items=d_lhs.size,
+        stream=cuda_stream,
+    )
+
+    cp.cuda.Device().synchronize()
+    np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19]))
+
+
+def test_counting_iterator_reduce():
+    d_output = cp.empty(1, dtype=np.int32)
+
+    cuda.compute.reduce_into(
+        d_in=CountingIterator(np.int32(3)),
+        d_out=d_output,
+        num_items=8,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int32),
+    )
+
+    assert d_output.get()[0] == 52
+
+
+def test_constant_iterator_reduce():
+    d_output = cp.empty(1, dtype=np.float32)
+
+    cuda.compute.reduce_into(
+        d_in=ConstantIterator(np.float32(1.5)),
+        d_out=d_output,
+        num_items=8,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.float32),
+    )
+
+    np.testing.assert_allclose(d_output.get()[0], np.float32(12.0))
+
+
+def test_cache_modified_input_iterator_reduce():
+    d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16)
+    d_output = cp.empty(1, dtype=np.uint16)
+    iterator = CacheModifiedInputIterator(d_input, modifier="stream")
+
+    cuda.compute.reduce_into(
+        d_in=iterator,
+        d_out=d_output,
+        num_items=d_input.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.uint16),
+    )
+
+    assert d_output.get()[0] == 30
+
+
+def test_reverse_input_iterator_scan():
+    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.inclusive_scan(
+        d_in=ReverseIterator(d_input),
+        d_out=d_output,
+        op=OpKind.PLUS,
+        init_value=np.array([0], dtype=np.int32),
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15]))
+
+
+def test_reverse_output_iterator_scan():
+    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_output = cp.empty_like(d_input)
+
+    cuda.compute.inclusive_scan(
+        d_in=d_input,
+        d_out=ReverseIterator(d_output),
+        op=OpKind.PLUS,
+        init_value=np.array([0], dtype=np.int32),
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1]))
+
+
+def test_permutation_iterator_reduce():
+    d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64)
+    d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32)
+    d_output = cp.empty(1, dtype=np.int64)
+
+    cuda.compute.reduce_into(
+        d_in=PermutationIterator(d_values, d_indices),
+        d_out=d_output,
+        num_items=d_indices.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int64),
+    )
+
+    assert d_output.get()[0] == 160
+
+
+def test_transform_iterator_reduce():
+    d_output = cp.empty(1, dtype=np.int32)
+    iterator = TransformIterator(
+        CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32
+    )
+
+    cuda.compute.reduce_into(
+        d_in=iterator,
+        d_out=d_output,
+        num_items=6,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int32),
+    )
+
+    assert d_output.get()[0] == 91
+
+
+def test_transform_output_iterator_reduce():
+    d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16)
+    d_output = cp.empty(1, dtype=np.int16)
+    output_iterator = TransformOutputIterator(
+        d_output, _raw_negate_i16_op(), output_value_type=cccl_int16
+    )
+
+    cuda.compute.reduce_into(
+        d_in=d_input,
+        d_out=output_iterator,
+        num_items=d_input.size,
+        op=OpKind.PLUS,
+        h_init=np.array([0], dtype=np.int16),
+    )
+
+    assert d_output.get()[0] == -10
+
+
+def test_zip_iterator_transform():
+    d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32)
+    d_output = cp.empty_like(d_lhs)
+
+    cuda.compute.unary_transform(
+        d_in=ZipIterator(d_lhs, d_rhs),
+        d_out=d_output,
+        op=_raw_zip_sum_i32_op(),
+        num_items=d_lhs.size,
+    )
+
+    np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get())
+
+
+def test_shuffle_iterator_transform():
+    num_items = 17
+    d_output = cp.empty(num_items, dtype=np.int64)
+
+    cuda.compute.unary_transform(
+        d_in=ShuffleIterator(num_items, seed=123),
+        d_out=d_output,
+        op=OpKind.IDENTITY,
         num_items=num_items,
-        num_segments=num_segments,
-        start_offsets_in=d_offsets[:-1],
-        end_offsets_in=d_offsets[1:],
-        order=cuda.compute.SortOrder.ASCENDING,
     )
 
-    result = d_keys_out.get()
-    # Expected: [1, 3, 4] | [1, 2, 5, 9] | [5, 6]
-    expected = np.array([1, 3, 4, 1, 2, 5, 9, 5, 6], dtype=np.int32)
-    assert np.array_equal(result, expected)
+    result = d_output.get()
+    assert sorted(result.tolist()) == list(range(num_items))
+
+
+def test_discard_iterator_transform():
+    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_reference = cp.full_like(d_input, -1)
+
+    cuda.compute.unary_transform(
+        d_in=d_input,
+        d_out=DiscardIterator(d_reference),
+        op=OpKind.IDENTITY,
+        num_items=d_input.size,
+    )
+
+    np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32))