From 6e91c891361ae961e09ef89f6bd6c18df66c02da Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 14:29:10 -0700 Subject: [PATCH 01/83] tile DeviceTransform policy picker --- .../bench/transform/tile/device_transform.cuh | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 cub/benchmarks/bench/transform/tile/device_transform.cuh diff --git a/cub/benchmarks/bench/transform/tile/device_transform.cuh b/cub/benchmarks/bench/transform/tile/device_transform.cuh new file mode 100644 index 00000000000..2849deb92d5 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/device_transform.cuh @@ -0,0 +1,66 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// tile port of cub::DeviceTransform - tile-size policy picker. +// Mirrors the bytes-in-flight target used by cub's transform policy so +// the tile launches land at comparable occupancy. + +#pragma once + +#include +#include + +namespace cub_tile::detail { + +constexpr int min_bytes_in_flight_per_sm(int cc_x10) { + if (cc_x10 >= 1000) return 64 * 1024; // B200 + if (cc_x10 >= 900) return 48 * 1024; // H100/H200 + if (cc_x10 >= 800) return 16 * 1024; // A100 + return 12 * 1024; +} + +constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; } +constexpr int round_up_pow2(int x) { + int p = 1; while (p < x) p *= 2; return p; +} +constexpr int min_size(int a) { return a; } +template constexpr int min_size(int a, int b, Ts... rest) { + int m = a < b ? a : b; return min_size(m, rest...); +} + +// mufu_heavy=true tells the policy the functor body has heavy MUFU usage. +// for small data types, vectorized load will make them arrive packed in registers +// and the compiler unpacks them and packs them back. reducing the compute work per +// thread helps here. 
+// need profiling to know the exact cause +template +constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) { + constexpr int threads_per_block = 128; + constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes + constexpr int max_items_per_thread = 32; + constexpr int max_occupancy = 16; + + constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...); + constexpr int items_for_vec = ceil_div(vector_bytes, min_elem); + + // Fill (zero inputs) keeps the same latency target by counting output bytes. + constexpr int bytes_per_iter = (sizeof...(Ins) > 0) + ? (int(sizeof(Ins)) + ... + 0) + : int(sizeof(Out)); + const int target = min_bytes_in_flight_per_sm(cc_x10); + const int items_for_latency = + ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter); + + int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency; + items = round_up_pow2(items); + if (items > max_items_per_thread) items = max_items_per_thread; + + if (mufu_heavy && min_elem < 4) { + const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 + if (items > byte_cap) items = byte_cap; + } + + return items * threads_per_block; +} + +} // namespace cub_tile::detail From 4980aab745a91c8d5611ce83ca5d5ba5d21e6619 Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 14:30:07 -0700 Subject: [PATCH 02/83] tile DeviceTransform kernels + public API --- .../bench/transform/tile/device_transform.cuh | 101 +++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/device_transform.cuh b/cub/benchmarks/bench/transform/tile/device_transform.cuh index 2849deb92d5..57a0b965985 100644 --- a/cub/benchmarks/bench/transform/tile/device_transform.cuh +++ b/cub/benchmarks/bench/transform/tile/device_transform.cuh @@ -1,14 +1,18 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: BSD-3-Clause -// tile port of cub::DeviceTransform - tile-size policy picker. -// Mirrors the bytes-in-flight target used by cub's transform policy so -// the tile launches land at comparable occupancy. +// tile port of cub::DeviceTransform. +// Public surface mirrors cub::DeviceTransform::{Transform, Fill}; the +// kernels themselves are written against the tile DSL (cuda::tiles). #pragma once #include #include +#include +#include + +#include "cuda_tile.h" namespace cub_tile::detail { @@ -63,4 +67,95 @@ constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) { return items * threads_per_block; } +template +__tile_global__ void transform_kernel(int64_t num_items_, Out* __restrict__ out_, + const Ins* __restrict__... ins_) { + namespace ct = cuda::tiles; + + const auto bx = ct::bid().x; + Fn fn{}; + + auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); + auto out = ct::assume_aligned<16>(out_); + + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_view = ct::partition_view{out_span, ct::shape{}}; + + auto load_one = [bx, num_items](auto* ptr_) { + auto ptr = ct::assume_aligned<16>(ptr_); + auto span = ct::tensor_span{ptr, ct::extents{num_items}}; + auto view = ct::partition_view{span, ct::shape{}}; + return view.load_masked(bx); + }; + + out_view.store_masked(fn(load_one(ins_)...), bx); +} + +template +cudaError_t launch_impl( + ::cuda::std::tuple inputs, + Out* output, + int64_t num_items, + cudaStream_t stream, + ::cuda::std::index_sequence) { + + if (num_items <= 0) return cudaSuccess; + + const int64_t num_blocks = (num_items + TileSize - 1) / TileSize; + + transform_kernel<<(num_blocks), 1, 0, stream>>>( + num_items, output, ::cuda::std::get(inputs)...); + + return cudaGetLastError(); +} + +template +__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) { + namespace ct = cuda::tiles; + const auto bx = ct::bid().x; + + auto num_items 
= ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); + auto out = ct::assume_aligned<16>(out_); + + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_view = ct::partition_view{out_span, ct::shape{}}; + using tile_t = ct::tile>; + out_view.store_masked(ct::full(value), bx); +} + } // namespace cub_tile::detail + +namespace cub_tile { + +struct DeviceTransform { + template + static cudaError_t Transform( + ::cuda::std::tuple inputs, + Out* output, + int64_t num_items, + Fn, + cudaStream_t stream = 0) { + constexpr int chosen = (TileSize > 0) + ? TileSize + : detail::pick_tile_size(MufuHeavy); + return detail::launch_impl( + inputs, output, num_items, stream, + ::cuda::std::index_sequence_for{}); + } + + // Fill + template + static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) { + if (num_items <= 0) return cudaSuccess; + constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size(); + const int64_t num_blocks = (num_items + chosen - 1) / chosen; + detail::fill_kernel<<(num_blocks), 1, 0, stream>>>( + num_items, output, value); + return cudaGetLastError(); + } +}; + +} // namespace cub_tile From 848a64519f259d56cb287764628973d690534f2e Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 15:09:31 -0700 Subject: [PATCH 03/83] bench_init RNG helper --- .../bench/transform/tile/bench_init.cuh | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 cub/benchmarks/bench/transform/tile/bench_init.cuh diff --git a/cub/benchmarks/bench/transform/tile/bench_init.cuh b/cub/benchmarks/bench/transform/tile/bench_init.cuh new file mode 100644 index 00000000000..da3e37f8c40 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/bench_init.cuh @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include +#include +#include + +namespace bench_init { + +// splitmix64 — fast deterministic PRNG, one mix per element. +__device__ __forceinline__ uint64_t splitmix64(uint64_t x) { + x += 0x9E3779B97F4A7C15ULL; + x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL; + x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL; + return x ^ (x >> 31); +} + +// Map a uint64 to a "reasonable" finite value of T in roughly [-1, 1) for floats, +// or to a non-zero byte for small ints (so neither all-zero nor pathological). +template +__device__ __forceinline__ T from_random(uint64_t r) { + if constexpr (std::is_same_v) { + // 24-bit mantissa precision, range (-1, 1) + uint32_t u = uint32_t(r >> 40); // 24 bits + float f = float(u) * (1.0f / float(1u << 23)) - 1.0f; + return f; + } else if constexpr (std::is_same_v) { + uint64_t u = r >> 11; // 53 bits + double d = double(u) * (1.0 / double(1ull << 52)) - 1.0; + return d; + } else if constexpr (std::is_same_v) { + uint32_t u = uint32_t(r >> 40); + float f = float(u) * (1.0f / float(1u << 23)) - 1.0f; + return __float2half(f); + } else if constexpr (std::is_same_v) { + uint32_t u = uint32_t(r >> 40); + float f = float(u) * (1.0f / float(1u << 23)) - 1.0f; + return __float2bfloat16(f); + } else { + // integer types: small non-zero values, biased away from zero so div is meaningful + int v = int(r & 0x7f) + 1; // 1..128 + if (r & 0x100) v = -v; // sometimes negative + return T(v); + } +} + +template +__global__ void rand_fill_kernel(T* __restrict__ p, int64_t n, uint64_t seed) { + int64_t stride = int64_t(gridDim.x) * blockDim.x; + for (int64_t i = int64_t(blockIdx.x) * blockDim.x + threadIdx.x; i < n; i += stride) { + p[i] = from_random(splitmix64(seed ^ uint64_t(i))); + } +} + +template +inline void rand_fill(T* p, int64_t n, uint64_t seed = 0xC0FFEE) { + int block = 256; + int64_t nblocks = (n + block - 1) / block; + int grid = int(nblocks < 65535 ? 
nblocks : 65535); + rand_fill_kernel<<>>(p, n, seed); +} + +} // namespace bench_init From 463213628fa7fa2bd5949a9e9e7b520559842366 Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 15:09:35 -0700 Subject: [PATCH 04/83] babelstream tile bench --- .../bench/transform/tile/babelstream.cu | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 cub/benchmarks/bench/transform/tile/babelstream.cu diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu new file mode 100644 index 00000000000..cbc7942b037 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// BabelStream-style bandwidth benchmarks on cub_tile::DeviceTransform. +// Mirror of cub/benchmarks/bench/transform/babelstream.cu so we can compare +// numbers side-by-side. + +#include + +#include "device_transform.cuh" + +#include +#include +#include +#include + +#include "bench_init.cuh" + +#ifndef TILE_SIZE +#define TILE_SIZE 0 // 0 = auto-pick via detail::pick_tile_size +#endif +#define STR_(x) #x +#define STR(x) STR_(x) + +struct mul_op { template __tile__ auto operator()(B b) const { return -(b + b); } }; +struct add_op { template __tile__ auto operator()(A a, B b) const { return a + b; } }; +struct triad_op { template __tile__ auto operator()(B b, C c) const { return b - c - c; } }; +struct nstream_op { template __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; } }; + +// True if `bytes_needed` worth of GPU memory is available, with 5% headroom +// for driver overhead. Caller should `state.skip(...)` on false. 
+inline bool gpu_mem_available(size_t bytes_needed) { + size_t free_b = 0, total_b = 0; + if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) return false; + return bytes_needed + (bytes_needed / 20) < free_b; +} + +template +struct Buffers { + T *a{}, *b{}, *c{}; + int64_t n{}; + Buffers(int64_t n) : n(n) { + cudaMalloc(&a, n * sizeof(T)); + cudaMalloc(&b, n * sizeof(T)); + cudaMalloc(&c, n * sizeof(T)); + // touch every page so HBM is actually backed (not cold-page tricks). + // values don't matter for BW measurement. + bench_init::rand_fill(a, n, 0xA111); + bench_init::rand_fill(b, n, 0xB222); + bench_init::rand_fill(c, n, 0xC333); + cudaDeviceSynchronize(); + } + ~Buffers() { cudaFree(a); cudaFree(b); cudaFree(c); } +}; + +// --- benchmarks --- +template +void mul(nvbench::state& state, nvbench::type_list) { + auto n = state.get_int64("Elements{io}"); + Buffers buf(n); + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(buf.b), buf.c, n, mul_op{}, launch.get_stream()); + }); +} + +template +void add(nvbench::state& state, nvbench::type_list) { + auto n = state.get_int64("Elements{io}"); + Buffers buf(n); + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(buf.a, buf.b), buf.c, n, add_op{}, launch.get_stream()); + }); +} + +template +void triad(nvbench::state& state, nvbench::type_list) { + auto n = state.get_int64("Elements{io}"); + Buffers buf(n); + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(buf.b, buf.c), buf.a, n, triad_op{}, launch.get_stream()); + }); +} + +template 
+void nstream(nvbench::state& state, nvbench::type_list) { + auto n = state.get_int64("Elements{io}"); + Buffers buf(n); + state.add_element_count(n); + state.add_global_memory_reads(3 * n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(buf.a, buf.b, buf.c), buf.a, n, nstream_op{}, launch.get_stream()); + }); +} + +using types = nvbench::type_list; +inline auto sizes = std::vector{16, 20, 24, 28, 31}; + +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(types)).set_name("tile_mul_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(types)).set_name("tile_add_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(types)).set_name("tile_triad_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); + +NVBENCH_MAIN From ea6ae70a7ea63cc09dcb9bd2632e070fed83a18a Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 15:09:54 -0700 Subject: [PATCH 05/83] pytorch tile bench --- .../bench/transform/tile/pytorch.cu | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 cub/benchmarks/bench/transform/tile/pytorch.cu diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu new file mode 100644 index 00000000000..76330520629 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// PyTorch ops on tile. Uses ct::tanh / ct::sin / ct::exp / ct::select. 
+ +#include +#include "device_transform.cuh" +#include +#include +#include +#include +#include + +#include "bench_init.cuh" + +namespace ct = cuda::tiles; + +// --- Unary --- (compute in float, cast back so the same ops work for __half/__bf16/float) +template __tile__ auto as_float(T v) { return ct::element_cast(v); } +template __tile__ auto from_float(F f) { return ct::element_cast>(f); } + +struct relu_op { template __tile__ auto operator()(T v) const { + auto f = as_float(v); return from_float(ct::select(f > 0.0f, f, f - f)); } }; +struct sigmoid_op { template __tile__ auto operator()(T v) const { + auto f = as_float(v); return from_float(1.0f / (1.0f + ct::exp(-f))); } }; +struct tanh_op { template __tile__ auto operator()(T v) const { + return from_float(ct::tanh(as_float(v))); } }; +struct gelu_op { template __tile__ auto operator()(T v) const { + constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; + auto f = as_float(v); + return from_float(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); } }; +struct sin_op { template __tile__ auto operator()(T v) const { return from_float(ct::sin(as_float(v))); } }; +struct exp_op { template __tile__ auto operator()(T v) const { return from_float(ct::exp(as_float(v))); } }; + +// --- Binary --- +struct binary_add { template __tile__ auto operator()(A a, B b) const { return a + b; } }; +struct binary_sub { template __tile__ auto operator()(A a, B b) const { return a - b; } }; +struct binary_mul { template __tile__ auto operator()(A a, B b) const { return a * b; } }; +struct binary_div { template __tile__ auto operator()(A a, B b) const { return a / b; } }; +// le/ge: cast the bool result tile to A's element type so it fits the float output buffer +// (CUB does the same implicit cast via its iterator path). 
+struct binary_le { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a <= b); } }; +struct binary_ge { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a >= b); } }; +struct binary_fmin { template __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } }; +struct binary_fmax { template __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } }; + + +template +void run_unary(nvbench::state& state) { + const auto n = state.get_int64("Elements{io}"); + T *in, *out; + cudaMalloc(&in, n * sizeof(T)); cudaMalloc(&out, n * sizeof(T)); + bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize(); + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform<0, MufuHeavy>( + ::cuda::std::make_tuple(in), out, n, Op{}, launch.get_stream()); + }); + cudaFree(in); cudaFree(out); +} + +template +void run_binary(nvbench::state& state) { + const auto n = state.get_int64("Elements{io}"); + T *a, *b, *out; + cudaMalloc(&a, n*sizeof(T)); cudaMalloc(&b, n*sizeof(T)); cudaMalloc(&out, n*sizeof(T)); + bench_init::rand_fill(a, n, 0xA111); + bench_init::rand_fill(b, n, 0xB222); + cudaDeviceSynchronize(); + state.add_element_count(n); + state.add_global_memory_reads(2*n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(a, b), out, n, Op{}, launch.get_stream()); + }); + cudaFree(a); cudaFree(b); cudaFree(out); +} + +using element_types = nvbench::type_list<__half, __nv_bfloat16, float>; +inline auto pt_sizes = std::vector{16, 20, 24, 28, 31}; + +#define UNARY_BENCH(name, op, mufu) \ + template void name##_bench(nvbench::state& state, nvbench::type_list) { run_unary(state); } \ + NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_" 
#name).add_int64_power_of_two_axis("Elements{io}", pt_sizes); + +// MufuHeavy hint set for ops dominated by MUFU intrinsics (exp/tanh/sin/cos). +// relu is just compare+select, so no hint. +UNARY_BENCH(relu, relu_op, false) +UNARY_BENCH(sigmoid, sigmoid_op, true) +UNARY_BENCH(tanh, tanh_op, true) +UNARY_BENCH(gelu, gelu_op, true) +UNARY_BENCH(sin, sin_op, true) +UNARY_BENCH(exp, exp_op, true) + +#define BINARY_BENCH(name, op) \ + template void name##_bench(nvbench::state& state, nvbench::type_list) { run_binary(state); } \ + NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_pt_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes); + +BINARY_BENCH(add, binary_add) +BINARY_BENCH(sub, binary_sub) +BINARY_BENCH(mul, binary_mul) +BINARY_BENCH(div, binary_div) +BINARY_BENCH(le, binary_le) +BINARY_BENCH(ge, binary_ge) +BINARY_BENCH(fmin, binary_fmin) +BINARY_BENCH(fmax, binary_fmax) + +NVBENCH_MAIN From 57c712c7178d5e668f5e256e16a440d3a93a8d4c Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 15:09:58 -0700 Subject: [PATCH 06/83] copy/grayscale/fill tile benches --- cub/benchmarks/bench/transform/tile/copy.cu | 44 +++++++++++++++ cub/benchmarks/bench/transform/tile/fill.cu | 30 +++++++++++ .../bench/transform/tile/grayscale.cu | 53 +++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 cub/benchmarks/bench/transform/tile/copy.cu create mode 100644 cub/benchmarks/bench/transform/tile/fill.cu create mode 100644 cub/benchmarks/bench/transform/tile/grayscale.cu diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu new file mode 100644 index 00000000000..6bb34a7157d --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// Pure copy bench (identity transform) — tile side. 
+// Isolates the load/store path from any arithmetic on top: useful for +// catching narrow-type store wars (e.g. byte stores capping BW). + +#include +#include "device_transform.cuh" +#include +#include +#include +#include + +#include "bench_init.cuh" + +struct identity { + template __tile__ auto operator()(T v) const { return v; } +}; + +template +void copy(nvbench::state& state, nvbench::type_list) { + auto n = state.get_int64("Elements{io}"); + T *in, *out; + cudaMalloc(&in, n * sizeof(T)); cudaMalloc(&out, n * sizeof(T)); + bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize(); + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(in), out, n, identity{}, launch.get_stream()); + }); + cudaFree(in); cudaFree(out); +} + +using types = nvbench::type_list; +inline auto sizes = std::vector{16, 20, 24, 28, 31}; + +NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(types)) + .set_name("tile_copy") + .add_int64_power_of_two_axis("Elements{io}", sizes); + +NVBENCH_MAIN diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu new file mode 100644 index 00000000000..2b0da544d38 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/fill.cu @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// Fill: zero-input broadcast. CUB models this as Transform with empty input tuple +// and a no-arg op. Tile can't express zero-input Transform directly, so we use the +// dedicated cub_tile::DeviceTransform::Fill API which writes a constant. 
+ +#include +#include "device_transform.cuh" +#include + +template +void fill(nvbench::state& state, nvbench::type_list) { + const auto n = state.get_int64("Elements{io}"); + T* out; cudaMalloc(&out, n * sizeof(T)); + state.add_element_count(n); + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Fill(out, n, T(42), launch.get_stream()); + }); + cudaFree(out); +} + +// CUB sweeps integral types: int8/16/32/64 +using fill_types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(fill, NVBENCH_TYPE_AXES(fill_types)).set_name("tile_fill") + .add_int64_power_of_two_axis("Elements{io}", std::vector{16, 20, 24, 28, 31}); + +NVBENCH_MAIN diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu new file mode 100644 index 00000000000..1f1db7fc737 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// Grayscale: RGB pixel -> luminance. Uses a 3-component pixel type. +// CUB stores rgb_t (12 bytes) packed; tile may or may not accept this as an +// element type. If tile rejects rgb_t, this bench will fail to compile — +// we'll then fall back to treating R/G/B as three separate float streams. + +#include +#include "device_transform.cuh" +#include "bench_init.cuh" +#include +#include +#include + +// Three-stream version (R, G, B as separate input arrays). +// Computationally equivalent to CUB's packed rgb_t version. 
+struct rgb_to_y { + template + __tile__ auto operator()(R r, G g, B b) const { + constexpr float w_r = 0.2989f; + constexpr float w_g = 0.587f; + constexpr float w_b = 0.114f; + return w_r * r + w_g * g + w_b * b; + } +}; + +template +void grayscale(nvbench::state& state, nvbench::type_list) { + const auto n = state.get_int64("Elements{io}"); + T *r, *g, *b, *out; + cudaMalloc(&r, n*sizeof(T)); cudaMalloc(&g, n*sizeof(T)); cudaMalloc(&b, n*sizeof(T)); + cudaMalloc(&out, n*sizeof(T)); + bench_init::rand_fill(r, n, 0xA111); + bench_init::rand_fill(g, n, 0xA222); + bench_init::rand_fill(b, n, 0xA333); + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); // matches CUB's rgb_t = 3*sizeof(T) + state.add_global_memory_writes(n); + state.exec([&](nvbench::launch& launch) { + cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(r, g, b), out, n, rgb_to_y{}, launch.get_stream()); + }); + cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out); +} + +using value_types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types)).set_name("tile_grayscale") + .add_int64_power_of_two_axis("Elements{io}", std::vector{16, 20, 24, 28, 31}); + +NVBENCH_MAIN From 7ecb113fa21fdbfaa9320b65f41907c6afbe497d Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 1 Jun 2026 15:10:02 -0700 Subject: [PATCH 07/83] tile DeviceTransform tests --- .../transform/tile/test_device_transform.cu | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 cub/benchmarks/bench/transform/tile/test_device_transform.cu diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu new file mode 100644 index 00000000000..d787a0df578 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// Standalone correctness tests for cub_tile::DeviceTransform. +// Sits next to the benches so it builds against the same tileiras +// toolchain and does not pretend to be part of CCCL's catch2 suite. + +#include "device_transform.cuh" + +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +int g_failures = 0; + +#define CUDA_CHECK(expr) \ + do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + std::fprintf(stderr, "%s:%d CUDA error: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(_e)); \ + std::exit(2); \ + } \ + } while (0) + +template +bool eq(T a, T b) { return a == b; } +inline bool eq(float a, float b) { + float diff = std::fabs(a - b); + float tol = 1e-5f * std::fmax(std::fabs(a), std::fabs(b)); + return diff <= std::fmax(tol, 1e-6f); +} + +template +void expect_array(const char* name, const std::vector& got, const std::vector& want) { + if (got.size() != want.size()) { + std::fprintf(stderr, "[FAIL] %s: size %zu != %zu\n", name, got.size(), want.size()); + ++g_failures; + return; + } + int mismatches = 0; + for (size_t i = 0; i < got.size(); ++i) { + if (!eq(got[i], want[i])) { + if (mismatches < 4) { + std::fprintf(stderr, "[FAIL] %s: idx=%zu got=%g want=%g\n", + name, i, double(got[i]), double(want[i])); + } + ++mismatches; + } + } + if (mismatches) { ++g_failures; std::fprintf(stderr, "[FAIL] %s: %d mismatches\n", name, mismatches); } + else { std::printf("[ OK ] %s (n=%zu)\n", name, got.size()); } +} + +struct identity_op { template __tile__ auto operator()(A a) const { return a; } }; +struct square_op { template __tile__ auto operator()(A a) const { return a * a; } }; +struct add_op { template __tile__ auto operator()(A a, B b) const { return a + b; } }; +struct mul_op { template __tile__ auto operator()(A a, B b) const { return a * b; } }; + +template +std::vector ramp(int64_t n, T start = T{0}, T step = T{1}) { + std::vector v(n); + for (int64_t i = 0; i < 
n; ++i) v[i] = T(start + step * T(i)); + return v; +} + +template +struct GpuVec { + T* d{}; + int64_t n{}; + explicit GpuVec(int64_t n) : n(n) { CUDA_CHECK(cudaMalloc(&d, n * sizeof(T))); } + explicit GpuVec(const std::vector& h) : GpuVec(int64_t(h.size())) { + CUDA_CHECK(cudaMemcpy(d, h.data(), n * sizeof(T), cudaMemcpyHostToDevice)); + } + ~GpuVec() { if (d) cudaFree(d); } + std::vector to_host() const { + std::vector h(n); + CUDA_CHECK(cudaMemcpy(h.data(), d, n * sizeof(T), cudaMemcpyDeviceToHost)); + return h; + } +}; + +template +void test_identity(int64_t n) { + auto h_in = ramp(n, T{1}, T{1}); + GpuVec dx(h_in), dy(n); + CUDA_CHECK(cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(dx.d), dy.d, n, identity_op{})); + CUDA_CHECK(cudaDeviceSynchronize()); + expect_array("identity", dy.to_host(), h_in); +} + +template +void test_square(int64_t n) { + auto h_in = ramp(n, T{1}, T{1}); + std::vector want(n); + for (int64_t i = 0; i < n; ++i) want[i] = h_in[i] * h_in[i]; + GpuVec dx(h_in), dy(n); + CUDA_CHECK(cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(dx.d), dy.d, n, square_op{})); + CUDA_CHECK(cudaDeviceSynchronize()); + expect_array("square", dy.to_host(), want); +} + +template +void test_add(int64_t n) { + auto ha = ramp(n, T{1}, T{1}); + auto hb = ramp(n, T{100}, T{2}); + std::vector want(n); + for (int64_t i = 0; i < n; ++i) want[i] = ha[i] + hb[i]; + GpuVec da(ha), db(hb), dc(n); + CUDA_CHECK(cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(da.d, db.d), dc.d, n, add_op{})); + CUDA_CHECK(cudaDeviceSynchronize()); + expect_array("add", dc.to_host(), want); +} + +template +void test_mul(int64_t n) { + auto ha = ramp(n, T{1}, T{1}); + auto hb = ramp(n, T{3}, T{1}); + std::vector want(n); + for (int64_t i = 0; i < n; ++i) want[i] = ha[i] * hb[i]; + GpuVec da(ha), db(hb), dc(n); + CUDA_CHECK(cub_tile::DeviceTransform::Transform( + ::cuda::std::make_tuple(da.d, db.d), dc.d, n, mul_op{})); + 
CUDA_CHECK(cudaDeviceSynchronize()); + expect_array("mul", dc.to_host(), want); +} + +template +void test_fill(int64_t n, T value) { + GpuVec dy(n); + CUDA_CHECK(cub_tile::DeviceTransform::Fill(dy.d, n, value)); + CUDA_CHECK(cudaDeviceSynchronize()); + std::vector want(n, value); + expect_array("fill", dy.to_host(), want); +} + +} // namespace + +int main() { + // pow-2, multiple tiles + test_identity(4096); + test_square(2048); + test_add(4096); + test_mul(2048); + test_fill(1024, 42); + + // non-pow-2 num_items (still multiple of 16 to satisfy assume_divisible<16>) + test_add(4112); // 16 * 257 + test_fill(1008, -7); // 16 * 63 + + // single full tile and below-one-tile (still >=16, div by 16) + test_square(16); + test_add(64); + + if (g_failures) { + std::fprintf(stderr, "\n%d test group(s) FAILED\n", g_failures); + return 1; + } + std::printf("\nall tests passed\n"); + return 0; +} From d5eff222b573c6133e408e2b7a91ecf81a308038 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 13:27:03 -0700 Subject: [PATCH 08/83] move tile DeviceTransform header into CUB public path The cub_tile::DeviceTransform implementation moves from cub/benchmarks/bench/transform/tile/device_transform.cuh into cub/cub/device/dispatch/dispatch_transform_tile.cuh so the CUB header tree can reference it. The bench .cu files now include from the new path. The hand-rolled ceil_div and round_up_pow2 helpers are replaced with cuda::ceil_div and cuda::next_power_of_two from . 
--- .../bench/transform/tile/babelstream.cu | 2 +- cub/benchmarks/bench/transform/tile/copy.cu | 2 +- .../bench/transform/tile/device_transform.cuh | 161 --------------- cub/benchmarks/bench/transform/tile/fill.cu | 2 +- .../bench/transform/tile/grayscale.cu | 2 +- .../bench/transform/tile/pytorch.cu | 2 +- .../transform/tile/test_device_transform.cu | 2 +- .../dispatch/dispatch_transform_tile.cuh | 194 ++++++++++++++++++ 8 files changed, 200 insertions(+), 167 deletions(-) delete mode 100644 cub/benchmarks/bench/transform/tile/device_transform.cuh create mode 100644 cub/cub/device/dispatch/dispatch_transform_tile.cuh diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index cbc7942b037..1e180f850a4 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -7,7 +7,7 @@ #include -#include "device_transform.cuh" +#include #include #include diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index 6bb34a7157d..951af8b0fed 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -6,7 +6,7 @@ // catching narrow-type store wars (e.g. byte stores capping BW). #include -#include "device_transform.cuh" +#include #include #include #include diff --git a/cub/benchmarks/bench/transform/tile/device_transform.cuh b/cub/benchmarks/bench/transform/tile/device_transform.cuh deleted file mode 100644 index 57a0b965985..00000000000 --- a/cub/benchmarks/bench/transform/tile/device_transform.cuh +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause - -// tile port of cub::DeviceTransform. -// Public surface mirrors cub::DeviceTransform::{Transform, Fill}; the -// kernels themselves are written against the tile DSL (cuda::tiles). 
- -#pragma once - -#include -#include -#include -#include - -#include "cuda_tile.h" - -namespace cub_tile::detail { - -constexpr int min_bytes_in_flight_per_sm(int cc_x10) { - if (cc_x10 >= 1000) return 64 * 1024; // B200 - if (cc_x10 >= 900) return 48 * 1024; // H100/H200 - if (cc_x10 >= 800) return 16 * 1024; // A100 - return 12 * 1024; -} - -constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; } -constexpr int round_up_pow2(int x) { - int p = 1; while (p < x) p *= 2; return p; -} -constexpr int min_size(int a) { return a; } -template constexpr int min_size(int a, int b, Ts... rest) { - int m = a < b ? a : b; return min_size(m, rest...); -} - -// mufu_heavy=true tells the policy the functor body has heavy MUFU usage. -// for small data types, vectorized load will make them arrive packed in registers -// and the compiler unpacks them and packs them back. reducing the compute work per -// thread helps here. -// need profiling to know the exact cause -template -constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) { - constexpr int threads_per_block = 128; - constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes - constexpr int max_items_per_thread = 32; - constexpr int max_occupancy = 16; - - constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...); - constexpr int items_for_vec = ceil_div(vector_bytes, min_elem); - - // Fill (zero inputs) keeps the same latency target by counting output bytes. - constexpr int bytes_per_iter = (sizeof...(Ins) > 0) - ? (int(sizeof(Ins)) + ... + 0) - : int(sizeof(Out)); - const int target = min_bytes_in_flight_per_sm(cc_x10); - const int items_for_latency = - ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter); - - int items = items_for_vec > items_for_latency ? 
items_for_vec : items_for_latency; - items = round_up_pow2(items); - if (items > max_items_per_thread) items = max_items_per_thread; - - if (mufu_heavy && min_elem < 4) { - const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 - if (items > byte_cap) items = byte_cap; - } - - return items * threads_per_block; -} - -template -__tile_global__ void transform_kernel(int64_t num_items_, Out* __restrict__ out_, - const Ins* __restrict__... ins_) { - namespace ct = cuda::tiles; - - const auto bx = ct::bid().x; - Fn fn{}; - - auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); - auto out = ct::assume_aligned<16>(out_); - - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; - auto out_view = ct::partition_view{out_span, ct::shape{}}; - - auto load_one = [bx, num_items](auto* ptr_) { - auto ptr = ct::assume_aligned<16>(ptr_); - auto span = ct::tensor_span{ptr, ct::extents{num_items}}; - auto view = ct::partition_view{span, ct::shape{}}; - return view.load_masked(bx); - }; - - out_view.store_masked(fn(load_one(ins_)...), bx); -} - -template -cudaError_t launch_impl( - ::cuda::std::tuple inputs, - Out* output, - int64_t num_items, - cudaStream_t stream, - ::cuda::std::index_sequence) { - - if (num_items <= 0) return cudaSuccess; - - const int64_t num_blocks = (num_items + TileSize - 1) / TileSize; - - transform_kernel<<(num_blocks), 1, 0, stream>>>( - num_items, output, ::cuda::std::get(inputs)...); - - return cudaGetLastError(); -} - -template -__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) { - namespace ct = cuda::tiles; - const auto bx = ct::bid().x; - - auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); - auto out = ct::assume_aligned<16>(out_); - - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; - auto out_view = ct::partition_view{out_span, ct::shape{}}; - using tile_t = ct::tile>; - 
out_view.store_masked(ct::full(value), bx); -} - -} // namespace cub_tile::detail - -namespace cub_tile { - -struct DeviceTransform { - template - static cudaError_t Transform( - ::cuda::std::tuple inputs, - Out* output, - int64_t num_items, - Fn, - cudaStream_t stream = 0) { - constexpr int chosen = (TileSize > 0) - ? TileSize - : detail::pick_tile_size(MufuHeavy); - return detail::launch_impl( - inputs, output, num_items, stream, - ::cuda::std::index_sequence_for{}); - } - - // Fill - template - static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) { - if (num_items <= 0) return cudaSuccess; - constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size(); - const int64_t num_blocks = (num_items + chosen - 1) / chosen; - detail::fill_kernel<<(num_blocks), 1, 0, stream>>>( - num_items, output, value); - return cudaGetLastError(); - } -}; - -} // namespace cub_tile diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu index 2b0da544d38..5514c1a1287 100644 --- a/cub/benchmarks/bench/transform/tile/fill.cu +++ b/cub/benchmarks/bench/transform/tile/fill.cu @@ -6,7 +6,7 @@ // dedicated cub_tile::DeviceTransform::Fill API which writes a constant. #include -#include "device_transform.cuh" +#include #include template diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index 1f1db7fc737..14641c2d872 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -7,7 +7,7 @@ // we'll then fall back to treating R/G/B as three separate float streams. 
#include -#include "device_transform.cuh" +#include #include "bench_init.cuh" #include #include diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 76330520629..e1eee3e4452 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -4,7 +4,7 @@ // PyTorch ops on tile. Uses ct::tanh / ct::sin / ct::exp / ct::select. #include -#include "device_transform.cuh" +#include #include #include #include diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu index d787a0df578..0df21dc66a3 100644 --- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu +++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu @@ -5,7 +5,7 @@ // Sits next to the benches so it builds against the same tileiras // toolchain and does not pretend to be part of CCCL's catch2 suite. -#include "device_transform.cuh" +#include #include #include diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh new file mode 100644 index 00000000000..f75cb8c3ccc --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -0,0 +1,194 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Tile port of cub::DeviceTransform. The public surface mirrors +// cub::DeviceTransform::{Transform, Fill}; the kernels are written against the +// tile DSL (cuda::tiles). This header is only safe to include when nvcc is +// invoked with --enable-tile and CTK >= 13.3. 
+ +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include + +#include + +namespace cub_tile::detail +{ + +constexpr int min_bytes_in_flight_per_sm(int cc_x10) +{ + if (cc_x10 >= 1000) + { + return 64 * 1024; // B200 + } + if (cc_x10 >= 900) + { + return 48 * 1024; // H100/H200 + } + if (cc_x10 >= 800) + { + return 16 * 1024; // A100 + } + return 12 * 1024; +} + +constexpr int min_size(int a) +{ + return a; +} +template +constexpr int min_size(int a, int b, Ts... rest) +{ + int m = a < b ? a : b; + return min_size(m, rest...); +} + +// mufu_heavy=true tells the policy the functor body has heavy MUFU usage. +// for small data types, vectorized load will make them arrive packed in +// registers and the compiler unpacks them and packs them back. reducing the +// compute work per thread helps here. need profiling to know the exact cause. +template +constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) +{ + constexpr int threads_per_block = 128; + constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes + constexpr int max_items_per_thread = 32; + constexpr int max_occupancy = 16; + + constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...); + constexpr int items_for_vec = static_cast(::cuda::ceil_div(vector_bytes, min_elem)); + + // Fill (zero inputs) keeps the same latency target by counting output bytes. + constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out)); + const int target = min_bytes_in_flight_per_sm(cc_x10); + const int items_for_latency = + static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); + + int items = items_for_vec > items_for_latency ? 
items_for_vec : items_for_latency; + items = static_cast(::cuda::next_power_of_two(static_cast(items))); + if (items > max_items_per_thread) + { + items = max_items_per_thread; + } + + if (mufu_heavy && min_elem < 4) + { + const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 + if (items > byte_cap) + { + items = byte_cap; + } + } + + return items * threads_per_block; +} + +template +__tile_global__ void +transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_) +{ + namespace ct = cuda::tiles; + + const auto bx = ct::bid().x; + Fn fn{}; + + auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); + auto out = ct::assume_aligned<16>(out_); + + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_view = ct::partition_view{out_span, ct::shape{}}; + + auto load_one = [bx, num_items](auto* ptr_) { + auto ptr = ct::assume_aligned<16>(ptr_); + auto span = ct::tensor_span{ptr, ct::extents{num_items}}; + auto view = ct::partition_view{span, ct::shape{}}; + return view.load_masked(bx); + }; + + out_view.store_masked(fn(load_one(ins_)...), bx); +} + +template +cudaError_t launch_impl( + ::cuda::std::tuple inputs, + Out* output, + int64_t num_items, + cudaStream_t stream, + ::cuda::std::index_sequence) +{ + if (num_items <= 0) + { + return cudaSuccess; + } + + const int64_t num_blocks = (num_items + TileSize - 1) / TileSize; + + transform_kernel<<(num_blocks), 1, 0, stream>>>( + num_items, output, ::cuda::std::get(inputs)...); + + return cudaGetLastError(); +} + +template +__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) +{ + namespace ct = cuda::tiles; + const auto bx = ct::bid().x; + + auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); + auto out = ct::assume_aligned<16>(out_); + + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_view = ct::partition_view{out_span, 
ct::shape{}}; + using tile_t = ct::tile>; + out_view.store_masked(ct::full(value), bx); +} + +} // namespace cub_tile::detail + +namespace cub_tile +{ + +struct DeviceTransform +{ + template + static cudaError_t + Transform(::cuda::std::tuple inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0) + { + constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size(MufuHeavy); + return detail::launch_impl( + inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); + } + + // Fill + template + static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) + { + if (num_items <= 0) + { + return cudaSuccess; + } + constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size(); + const int64_t num_blocks = (num_items + chosen - 1) / chosen; + detail::fill_kernel + <<(num_blocks), 1, 0, stream>>>(num_items, output, value); + return cudaGetLastError(); + } +}; + +} // namespace cub_tile From 5f9885749d9371108f6034cb5e3b35314b4f696d Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 13:32:59 -0700 Subject: [PATCH 09/83] move tile dispatch into cub::detail::transform::tile namespace The cub_tile namespace and its hand-rolled detail layout move under cub::detail::transform::tile to match how CUB groups the existing transform internals. A type alias is kept at cub_tile::DeviceTransform so the benches and tests still compile during the transition. The whole file body is also gated by _CCCL_CTK_AT_LEAST(13, 3) so older toolchains never see the tile DSL types. 
--- .../dispatch/dispatch_transform_tile.cuh | 49 +++++++++++-------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index f75cb8c3ccc..584d9829963 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -3,8 +3,8 @@ // Tile port of cub::DeviceTransform. The public surface mirrors // cub::DeviceTransform::{Transform, Fill}; the kernels are written against the -// tile DSL (cuda::tiles). This header is only safe to include when nvcc is -// invoked with --enable-tile and CTK >= 13.3. +// tile DSL (cuda::tiles). This header requires CTK 13.3 or newer and nvcc +// invoked with --enable-tile. #pragma once @@ -18,16 +18,20 @@ # pragma system_header #endif // no system header -#include -#include -#include +#if _CCCL_CTK_AT_LEAST(13, 3) -#include -#include +# include +# include +# include -#include +# include +# include -namespace cub_tile::detail +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile { constexpr int min_bytes_in_flight_per_sm(int cc_x10) @@ -159,20 +163,14 @@ __tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T val out_view.store_masked(ct::full(value), bx); } -} // namespace cub_tile::detail - -namespace cub_tile -{ - struct DeviceTransform { template static cudaError_t Transform(::cuda::std::tuple inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0) { - constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size(MufuHeavy); - return detail::launch_impl( - inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); + constexpr int chosen = (TileSize > 0) ? 
TileSize : pick_tile_size(MufuHeavy); + return launch_impl(inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); } // Fill @@ -183,12 +181,23 @@ struct DeviceTransform { return cudaSuccess; } - constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size(); + constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size(); const int64_t num_blocks = (num_items + chosen - 1) / chosen; - detail::fill_kernel - <<(num_blocks), 1, 0, stream>>>(num_items, output, value); + fill_kernel<<(num_blocks), 1, 0, stream>>>(num_items, output, value); return cudaGetLastError(); } }; +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +// Compatibility shim. Existing benches and tests still call +// cub_tile::DeviceTransform; once they move to cub::DeviceTransform with named +// functors and the trait dispatch, this alias can be removed. +namespace cub_tile +{ +using DeviceTransform = ::cub::detail::transform::tile::DeviceTransform; } // namespace cub_tile + +#endif // _CCCL_CTK_AT_LEAST(13, 3) From 8ae89ef3d3912d8951714c3e9a704f8e36163230 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 13:45:33 -0700 Subject: [PATCH 10/83] add tile dispatch trait header --- .../dispatch_transform_tile_traits.cuh | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh new file mode 100644 index 00000000000..8bfdadaac87 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -0,0 +1,81 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Compile-time policy for cub::DeviceTransform's tile path. 
+// +// tile_eligible_v answers "should DeviceTransform::Transform +// route to the tile kernel for this (functor, element type, input arity)?". +// tile_mufu_heavy_v hints the tile policy picker that Op spends most of +// its time on MUFU instructions, so the picker caps items/thread at the +// vector width to avoid piling up MUFU work that cannot SIMD on Blackwell +// for sub-4-byte types. +// +// This header is pure trait infrastructure; no callers yet. Specializations +// land here as benches confirm tile wins for a (Op, T, NIn) combination. + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(13, 3) + +# include +# include +# include + +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile +{ + +// Primary template: tile path is opt-in. Specialize for combinations where a +// bench has shown the tile kernel beats the existing CUB algorithms. +template +struct tile_eligible : ::cuda::std::false_type +{}; + +template +inline constexpr bool tile_eligible_v = tile_eligible::value; + +// Companion trait: report Op as MUFU-heavy so the tile policy picker caps +// items/thread at the vector width on small element types. Default is false. 
+template +struct tile_mufu_heavy : ::cuda::std::false_type +{}; + +template +inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; + +# if _CCCL_HAS_NVFP16() +template <> +struct tile_eligible<::cuda::std::plus<__half>, __half, 2> : ::cuda::std::true_type +{}; +template <> +struct tile_eligible<::cuda::std::multiplies<__half>, __half, 2> : ::cuda::std::true_type +{}; +# endif // _CCCL_HAS_NVFP16() + +# if _CCCL_HAS_NVBF16() +template <> +struct tile_eligible<::cuda::std::plus<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type +{}; +template <> +struct tile_eligible<::cuda::std::multiplies<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type +{}; +# endif // _CCCL_HAS_NVBF16() + +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +#endif // _CCCL_CTK_AT_LEAST(13, 3) From 37c8b718441a828fdbb4f46c5c0fafb9d128d87f Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 13:54:47 -0700 Subject: [PATCH 11/83] split tile dispatch into kernel and tuning headers --- .../dispatch/dispatch_transform_tile.cuh | 113 +----------------- .../kernels/kernel_transform_tile.cuh | 76 ++++++++++++ .../dispatch/tuning/tuning_transform_tile.cuh | 103 ++++++++++++++++ 3 files changed, 182 insertions(+), 110 deletions(-) create mode 100644 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh create mode 100644 cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 584d9829963..47f25c46bd9 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -20,12 +20,13 @@ #if _CCCL_CTK_AT_LEAST(13, 3) -# include +# include +# include + # include # include # include -# include # include @@ -34,99 +35,6 @@ CUB_NAMESPACE_BEGIN namespace detail::transform::tile { -constexpr int min_bytes_in_flight_per_sm(int cc_x10) -{ - if (cc_x10 >= 1000) - { - return 64 * 1024; // 
B200 - } - if (cc_x10 >= 900) - { - return 48 * 1024; // H100/H200 - } - if (cc_x10 >= 800) - { - return 16 * 1024; // A100 - } - return 12 * 1024; -} - -constexpr int min_size(int a) -{ - return a; -} -template -constexpr int min_size(int a, int b, Ts... rest) -{ - int m = a < b ? a : b; - return min_size(m, rest...); -} - -// mufu_heavy=true tells the policy the functor body has heavy MUFU usage. -// for small data types, vectorized load will make them arrive packed in -// registers and the compiler unpacks them and packs them back. reducing the -// compute work per thread helps here. need profiling to know the exact cause. -template -constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) -{ - constexpr int threads_per_block = 128; - constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes - constexpr int max_items_per_thread = 32; - constexpr int max_occupancy = 16; - - constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...); - constexpr int items_for_vec = static_cast(::cuda::ceil_div(vector_bytes, min_elem)); - - // Fill (zero inputs) keeps the same latency target by counting output bytes. - constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out)); - const int target = min_bytes_in_flight_per_sm(cc_x10); - const int items_for_latency = - static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); - - int items = items_for_vec > items_for_latency ? 
items_for_vec : items_for_latency; - items = static_cast(::cuda::next_power_of_two(static_cast(items))); - if (items > max_items_per_thread) - { - items = max_items_per_thread; - } - - if (mufu_heavy && min_elem < 4) - { - const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 - if (items > byte_cap) - { - items = byte_cap; - } - } - - return items * threads_per_block; -} - -template -__tile_global__ void -transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_) -{ - namespace ct = cuda::tiles; - - const auto bx = ct::bid().x; - Fn fn{}; - - auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); - auto out = ct::assume_aligned<16>(out_); - - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; - auto out_view = ct::partition_view{out_span, ct::shape{}}; - - auto load_one = [bx, num_items](auto* ptr_) { - auto ptr = ct::assume_aligned<16>(ptr_); - auto span = ct::tensor_span{ptr, ct::extents{num_items}}; - auto view = ct::partition_view{span, ct::shape{}}; - return view.load_masked(bx); - }; - - out_view.store_masked(fn(load_one(ins_)...), bx); -} - template cudaError_t launch_impl( ::cuda::std::tuple inputs, @@ -148,21 +56,6 @@ cudaError_t launch_impl( return cudaGetLastError(); } -template -__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) -{ - namespace ct = cuda::tiles; - const auto bx = ct::bid().x; - - auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); - auto out = ct::assume_aligned<16>(out_); - - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; - auto out_view = ct::partition_view{out_span, ct::shape{}}; - using tile_t = ct::tile>; - out_view.store_masked(ct::full(value), bx); -} - struct DeviceTransform { template diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh new file mode 100644 index 
00000000000..3d038c9068f --- /dev/null +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels +// assume 16-byte alignment on every pointer and 16-byte divisibility on +// num_items so the compiler can pick LDG.E.128. Callers in the dispatch +// header are responsible for honoring those preconditions. + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(13, 3) + +# include + +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile +{ + +template +__tile_global__ void +transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... 
ins_) +{ + namespace ct = cuda::tiles; + + const auto bx = ct::bid().x; + Fn fn{}; + + auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); + auto out = ct::assume_aligned<16>(out_); + + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_view = ct::partition_view{out_span, ct::shape{}}; + + auto load_one = [bx, num_items](auto* ptr_) { + auto ptr = ct::assume_aligned<16>(ptr_); + auto span = ct::tensor_span{ptr, ct::extents{num_items}}; + auto view = ct::partition_view{span, ct::shape{}}; + return view.load_masked(bx); + }; + + out_view.store_masked(fn(load_one(ins_)...), bx); +} + +template +__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) +{ + namespace ct = cuda::tiles; + const auto bx = ct::bid().x; + + auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); + auto out = ct::assume_aligned<16>(out_); + + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_view = ct::partition_view{out_span, ct::shape{}}; + using tile_t = ct::tile>; + out_view.store_masked(ct::full(value), bx); +} + +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +#endif // _CCCL_CTK_AT_LEAST(13, 3) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh new file mode 100644 index 00000000000..86c2d1b394f --- /dev/null +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Policy picker for cub::DeviceTransform's tile path. Mirrors the +// bytes-in-flight target used by CUB's non-tile algorithms (see +// tuning_transform.cuh's cc_to_min_bytes_in_flight) but expresses the +// answer as a TileSize, since tile kernels partition by compile-time +// shape rather than threads*items. 
+ +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(13, 3) + +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile +{ + +constexpr int min_bytes_in_flight_per_sm(int cc_x10) +{ + if (cc_x10 >= 1000) + { + return 64 * 1024; // B200 + } + if (cc_x10 >= 900) + { + return 48 * 1024; // H100/H200 + } + if (cc_x10 >= 800) + { + return 16 * 1024; // A100 + } + return 12 * 1024; +} + +constexpr int min_size(int a) +{ + return a; +} +template +constexpr int min_size(int a, int b, Ts... rest) +{ + int m = a < b ? a : b; + return min_size(m, rest...); +} + +// mufu_heavy=true tells the policy the functor body has heavy MUFU usage. +// for small data types, vectorized load will make them arrive packed in +// registers and the compiler unpacks them and packs them back. reducing the +// compute work per thread helps here. need profiling to know the exact cause. +template +constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) +{ + constexpr int threads_per_block = 128; + constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes + constexpr int max_items_per_thread = 32; + constexpr int max_occupancy = 16; + + constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...); + constexpr int items_for_vec = static_cast(::cuda::ceil_div(vector_bytes, min_elem)); + + // Fill (zero inputs) keeps the same latency target by counting output bytes. + constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out)); + const int target = min_bytes_in_flight_per_sm(cc_x10); + const int items_for_latency = + static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); + + int items = items_for_vec > items_for_latency ? 
items_for_vec : items_for_latency; + items = static_cast(::cuda::next_power_of_two(static_cast(items))); + if (items > max_items_per_thread) + { + items = max_items_per_thread; + } + + if (mufu_heavy && min_elem < 4) + { + const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 + if (items > byte_cap) + { + items = byte_cap; + } + } + + return items * threads_per_block; +} + +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +#endif // _CCCL_CTK_AT_LEAST(13, 3) From 7fb935fd2c4ea638835cf9948d254c811b761996 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 14:14:10 -0700 Subject: [PATCH 12/83] wire tile dispatch hook into cub::DeviceTransform --- cub/cub/device/device_transform.cuh | 31 ++++++++ .../dispatch/dispatch_transform_tile.cuh | 77 +++++++++++++++++++ .../dispatch_transform_tile_traits.cuh | 68 ++++++++++++---- 3 files changed, 161 insertions(+), 15 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index d8ad0354bfc..77c50432e34 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -17,6 +17,10 @@ #include #include +#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +# include +#endif + #include #include #include @@ -99,6 +103,32 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); +#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() + // Opt-in tile path. When every compile-time gate passes we route here + // and DO NOT instantiate the standard CUB transform dispatch below -- + // under --enable-tile that path fails to compile for many (Op, T) + // combinations. 
Runtime alignment / divisibility violations on this + // branch surface as cudaErrorInvalidValue; the caller is expected to + // satisfy the 16-byte preconditions when opting into the tile path. + if constexpr (StableAddress == detail::transform::requires_stable_address::no + && ::cuda::std::is_same_v + && detail::transform::tile::tile_dispatch_eligible_v< + TransformOp, + RandomAccessIteratorOut, + RandomAccessIteratorsIn...>) + { + cudaError_t tile_result; + if (detail::transform::tile::try_dispatch( + inputs, output, static_cast(num_items), stream, tile_result)) + { + return tile_result; + } + return cudaErrorInvalidValue; + } + else +#endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION() + { + using tuning_env = ::cuda::std::execution::__query_result_or_t>; using default_policy_selector = @@ -122,6 +152,7 @@ struct DeviceTransform ::cuda::std::move(transform_op), stream, policy_selector{}); + } } // TODO(bgruber): we want to eventually forward the output tuple to the kernel and optimize writing multiple streams diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 47f25c46bd9..23b1fac2790 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -20,9 +20,20 @@ #if _CCCL_CTK_AT_LEAST(13, 3) +# include # include # include +# include +# include + +# include +# include +# include +# include +# include +# include +# include # include # include @@ -81,6 +92,72 @@ struct DeviceTransform } }; +namespace __detail +{ +template +using __unwrapped_value_t = + ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t()))>>; +} // namespace __detail + +// Combined compile-time predicate used by cub::DeviceTransform's __transform_internal +// to decide whether to route a given (Op, OutIter, InIters...) to the tile path. 
+// The call site lifts this into an `if constexpr` so the standard CUB dispatch +// is not instantiated when tile takes over (under --enable-tile the standard +// path fails to compile for many functor/type combinations). +template +inline constexpr bool tile_dispatch_eligible_v = + THRUST_NS_QUALIFIER::is_contiguous_iterator_v + && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) + && tile_eligible_v, sizeof...(InIters)>; + +// Bridge between cub::DeviceTransform::__transform_internal and the tile +// DeviceTransform above. Precondition: tile_dispatch_eligible_v is true. Returns true and writes the launch result when the +// call was handled; returns false when the runtime 16-byte alignment / +// divisibility preconditions are not satisfied (caller surfaces that as +// cudaErrorInvalidValue -- there is no CUB fallback under --enable-tile). +// +// The tile kernel is launched with the trait's tile_op_type (a tile-friendly +// mirror of Op with __tile__ operator), NOT the user's Op instance -- the +// user's scalar functor cannot be invoked on ct::tile arguments. +template +CUB_RUNTIME_FUNCTION bool try_dispatch( + ::cuda::std::tuple inputs, + OutIter output, + OffsetT num_items, + cudaStream_t stream, + cudaError_t& result) +{ + auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + auto in_ptrs = ::cuda::std::apply( + [](auto... 
iters) { + return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...); + }, + inputs); + using out_value_t = ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t>; + using tile_op_t = typename tile_eligible::tile_op_type; + static_assert(::cuda::std::is_empty_v, + "tile_op_type must be stateless (the tile kernel default-constructs it)"); + static_assert(::cuda::std::is_trivially_default_constructible_v, + "tile_op_type must be trivially default constructible"); + + constexpr int kAlign = 16; + const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign); + const bool aligned_in = + ::cuda::std::apply([](auto... p) { return ((::cuda::is_aligned(p, kAlign)) && ...); }, in_ptrs); + // Tile DSL's tensor_span uses uint32_t shape; cap at 2^31 to stay below + // the wraparound cliff at 2^32. + constexpr OffsetT kMaxItems = OffsetT{1} << 31; + if (!aligned_out || !aligned_in || (num_items % kAlign) != 0 || num_items > kMaxItems) + { + return false; + } + result = DeviceTransform::template Transform<0, tile_mufu_heavy_v, tile_op_t>( + in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); + return true; +} + } // namespace detail::transform::tile CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index 8bfdadaac87..c823cc46b99 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -3,15 +3,20 @@ // Compile-time policy for cub::DeviceTransform's tile path. // +// Users call cub::DeviceTransform::Transform with whatever scalar functor they +// have (e.g. cuda::std::plus<__half>). That functor is NOT directly callable +// from a tile transform_kernel -- its operator() takes scalars, not ct::tile. 
+// So eligible specializations declare a `tile_op_type` member that names a +// tile-friendly replacement functor (with __tile__ templated operator()) that +// performs the same operation. The dispatch hook then launches the tile +// kernel with the replacement, not the user's original. +// // tile_eligible_v answers "should DeviceTransform::Transform // route to the tile kernel for this (functor, element type, input arity)?". // tile_mufu_heavy_v hints the tile policy picker that Op spends most of // its time on MUFU instructions, so the picker caps items/thread at the // vector width to avoid piling up MUFU work that cannot SIMD on Blackwell // for sub-4-byte types. -// -// This header is pure trait infrastructure; no callers yet. Specializations -// land here as benches confirm tile wins for a (Op, T, NIn) combination. #pragma once @@ -33,13 +38,38 @@ # include +# if _CCCL_TILE_COMPILATION() +# include +# endif + CUB_NAMESPACE_BEGIN namespace detail::transform::tile { -// Primary template: tile path is opt-in. Specialize for combinations where a -// bench has shown the tile kernel beats the existing CUB algorithms. +# if _CCCL_TILE_COMPILATION() +// Tile-friendly mirrors of common cuda::std ops. Each has a __tile__ +// templated operator() so it can be invoked from inside transform_kernel +// where the arguments are ct::tile rather than scalar T. +struct tile_plus +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; + +struct tile_multiplies +{ + template + __tile__ auto operator()(A a, B b) const + { + return a * b; + } +}; +# endif // _CCCL_TILE_COMPILATION() + template struct tile_eligible : ::cuda::std::false_type {}; @@ -47,8 +77,6 @@ struct tile_eligible : ::cuda::std::false_type template inline constexpr bool tile_eligible_v = tile_eligible::value; -// Companion trait: report Op as MUFU-heavy so the tile policy picker caps -// items/thread at the vector width on small element types. Default is false. 
template struct tile_mufu_heavy : ::cuda::std::false_type {}; @@ -56,23 +84,33 @@ struct tile_mufu_heavy : ::cuda::std::false_type template inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; -# if _CCCL_HAS_NVFP16() +# if _CCCL_TILE_COMPILATION() +# if _CCCL_HAS_NVFP16() template <> struct tile_eligible<::cuda::std::plus<__half>, __half, 2> : ::cuda::std::true_type -{}; +{ + using tile_op_type = tile_plus; +}; template <> struct tile_eligible<::cuda::std::multiplies<__half>, __half, 2> : ::cuda::std::true_type -{}; -# endif // _CCCL_HAS_NVFP16() +{ + using tile_op_type = tile_multiplies; +}; +# endif // _CCCL_HAS_NVFP16() -# if _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVBF16() template <> struct tile_eligible<::cuda::std::plus<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type -{}; +{ + using tile_op_type = tile_plus; +}; template <> struct tile_eligible<::cuda::std::multiplies<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type -{}; -# endif // _CCCL_HAS_NVBF16() +{ + using tile_op_type = tile_multiplies; +}; +# endif // _CCCL_HAS_NVBF16() +# endif // _CCCL_TILE_COMPILATION() } // namespace detail::transform::tile From 906fbecc2a88323e850b85d4532c732c0f0c586e Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 14:24:08 -0700 Subject: [PATCH 13/83] drop runtime gates from tile dispatch helper --- cub/cub/device/device_transform.cuh | 15 +++------ .../dispatch/dispatch_transform_tile.cuh | 32 ++++--------------- 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 77c50432e34..4890a908085 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -107,9 +107,9 @@ struct DeviceTransform // Opt-in tile path. When every compile-time gate passes we route here // and DO NOT instantiate the standard CUB transform dispatch below -- // under --enable-tile that path fails to compile for many (Op, T) - // combinations. 
Runtime alignment / divisibility violations on this - // branch surface as cudaErrorInvalidValue; the caller is expected to - // satisfy the 16-byte preconditions when opting into the tile path. + // combinations. The 16-byte alignment, num_items divisibility, and the + // 2^31 size cap are the caller's contract once the trait flags the + // (Op, T, NIn) combo as tile-eligible. if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v && detail::transform::tile::tile_dispatch_eligible_v< @@ -117,13 +117,8 @@ struct DeviceTransform RandomAccessIteratorOut, RandomAccessIteratorsIn...>) { - cudaError_t tile_result; - if (detail::transform::tile::try_dispatch( - inputs, output, static_cast(num_items), stream, tile_result)) - { - return tile_result; - } - return cudaErrorInvalidValue; + return detail::transform::tile::dispatch( + inputs, output, static_cast(num_items), stream); } else #endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION() diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 23b1fac2790..41a5a4e9cb3 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -27,11 +27,10 @@ # include # include -# include # include -# include # include # include +# include # include # include # include @@ -113,21 +112,16 @@ inline constexpr bool tile_dispatch_eligible_v = // Bridge between cub::DeviceTransform::__transform_internal and the tile // DeviceTransform above. Precondition: tile_dispatch_eligible_v is true. Returns true and writes the launch result when the -// call was handled; returns false when the runtime 16-byte alignment / -// divisibility preconditions are not satisfied (caller surfaces that as -// cudaErrorInvalidValue -- there is no CUB fallback under --enable-tile). +// InIters...> is true. 
The 16-byte pointer alignment, num_items divisibility, +// and 2^31 size cap (the tile DSL's uint32_t extent ceiling) are the caller's +// contract -- opting into the tile path is opting into these preconditions. // // The tile kernel is launched with the trait's tile_op_type (a tile-friendly // mirror of Op with __tile__ operator), NOT the user's Op instance -- the // user's scalar functor cannot be invoked on ct::tile arguments. template -CUB_RUNTIME_FUNCTION bool try_dispatch( - ::cuda::std::tuple inputs, - OutIter output, - OffsetT num_items, - cudaStream_t stream, - cudaError_t& result) +CUB_RUNTIME_FUNCTION cudaError_t dispatch( + ::cuda::std::tuple inputs, OutIter output, OffsetT num_items, cudaStream_t stream) { auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); auto in_ptrs = ::cuda::std::apply( @@ -142,20 +136,8 @@ CUB_RUNTIME_FUNCTION bool try_dispatch( static_assert(::cuda::std::is_trivially_default_constructible_v, "tile_op_type must be trivially default constructible"); - constexpr int kAlign = 16; - const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign); - const bool aligned_in = - ::cuda::std::apply([](auto... p) { return ((::cuda::is_aligned(p, kAlign)) && ...); }, in_ptrs); - // Tile DSL's tensor_span uses uint32_t shape; cap at 2^31 to stay below - // the wraparound cliff at 2^32. 
- constexpr OffsetT kMaxItems = OffsetT{1} << 31; - if (!aligned_out || !aligned_in || (num_items % kAlign) != 0 || num_items > kMaxItems) - { - return false; - } - result = DeviceTransform::template Transform<0, tile_mufu_heavy_v, tile_op_t>( + return DeviceTransform::template Transform<0, tile_mufu_heavy_v, tile_op_t>( in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); - return true; } } // namespace detail::transform::tile From 249131e6cb68e144ceea71e9495785616a68aaa0 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 14:28:29 -0700 Subject: [PATCH 14/83] add runtime alignment check before routing to tile --- cub/cub/device/device_transform.cuh | 4 +++ .../dispatch/dispatch_transform_tile.cuh | 33 +++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 4890a908085..4a27a08b8d5 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -117,6 +117,10 @@ struct DeviceTransform RandomAccessIteratorOut, RandomAccessIteratorsIn...>) { + if (!detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast(num_items))) + { + return cudaErrorInvalidValue; + } return detail::transform::tile::dispatch( inputs, output, static_cast(num_items), stream); } diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 41a5a4e9cb3..414f64a3075 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -27,6 +27,7 @@ # include # include +# include # include # include # include @@ -110,11 +111,37 @@ inline constexpr bool tile_dispatch_eligible_v = && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) && tile_eligible_v, sizeof...(InIters)>; +// Runtime predicate consulted by the cub::DeviceTransform tile hook before +// it commits to the tile path. 
Mirrors how CUB's dispatch_t::CanVectorize +// guards the vectorized kernel. The tile kernels use ct::assume_aligned<16> +// and ct::assume_divisible<16>, so violating these at runtime is UB. +// Returns false to tell the hook to surface cudaErrorInvalidValue. +template +CUB_RUNTIME_FUNCTION bool +runtime_preconditions_ok(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) +{ + constexpr int kAlign = 16; + // Tile DSL's tensor_span uses uint32_t shape internally; values >= 2^32 + // wrap to 0. Cap at 2^31 to stay below the cliff with margin. + constexpr OffsetT kMaxItems = OffsetT{1} << 31; + + auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign); + const bool aligned_in = ::cuda::std::apply( + [](auto... iters) { + return ((::cuda::is_aligned(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters), kAlign)) && ...); + }, + inputs); + + return aligned_out && aligned_in && (num_items % kAlign) == 0 && num_items <= kMaxItems; +} + // Bridge between cub::DeviceTransform::__transform_internal and the tile // DeviceTransform above. Precondition: tile_dispatch_eligible_v is true. The 16-byte pointer alignment, num_items divisibility, -// and 2^31 size cap (the tile DSL's uint32_t extent ceiling) are the caller's -// contract -- opting into the tile path is opting into these preconditions. +// InIters...> is true AND runtime_preconditions_ok returned true. The kernel +// itself assumes 16-byte pointer alignment and num_items divisibility; the +// caller (the hook in device_transform.cuh) is responsible for checking +// runtime_preconditions_ok first. 
// // The tile kernel is launched with the trait's tile_op_type (a tile-friendly // mirror of Op with __tile__ operator), NOT the user's Op instance -- the From 744dbb32078849753956f8a984dcd1b269593434 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 15:59:44 -0700 Subject: [PATCH 15/83] drop _CCCL_TILE from _CCCL_API to unblock CUB under enable-tile --- libcudacxx/include/cuda/std/__cccl/visibility.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__cccl/visibility.h b/libcudacxx/include/cuda/std/__cccl/visibility.h index 075a98130aa..47337d8d8fd 100644 --- a/libcudacxx/include/cuda/std/__cccl/visibility.h +++ b/libcudacxx/include/cuda/std/__cccl/visibility.h @@ -116,7 +116,15 @@ # define _CCCL_DEVICE_API _CCCL_DEVICE # define _CCCL_TILE_API _CCCL_TILE #else // ^^^ _CCCL_COMPILER(NVHPC) ^^^ / vvv !_CCCL_COMPILER(NVHPC) vvv -# define _CCCL_API _CCCL_TILE _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION +// Local fork patch: drop _CCCL_TILE from _CCCL_API. Under the tile compiler's +// local-only context check, marking a host/device utility __tile__ means its +// body must satisfy tile restrictions even when the caller is non-tile. That +// fails for any utility that takes a user-provided callable (apply, invoke, +// visit, runtime_assume_aligned, ...). Drop the marker globally; tile DSL +// code in this branch uses its own tile-marked operations and doesn't depend +// on libcudacxx utilities being tile-callable. Revert when upstream fixes the +// marking discipline (or the compiler adopts per-instantiation checking). 
+# define _CCCL_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # define _CCCL_HOST_DEVICE_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # define _CCCL_HOST_API _CCCL_HOST _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # define _CCCL_DEVICE_API _CCCL_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION From 74fd6ccabe35911082f28cfcb3aaee9947ad2cc3 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 16:05:43 -0700 Subject: [PATCH 16/83] fall back to standard CUB dispatch when tile preconditions fail --- cub/cub/device/device_transform.cuh | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 4a27a08b8d5..1560d9e1a68 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -104,12 +104,12 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); #if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() - // Opt-in tile path. When every compile-time gate passes we route here - // and DO NOT instantiate the standard CUB transform dispatch below -- - // under --enable-tile that path fails to compile for many (Op, T) - // combinations. The 16-byte alignment, num_items divisibility, and the - // 2^31 size cap are the caller's contract once the trait flags the - // (Op, T, NIn) combo as tile-eligible. + // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND + // the runtime alignment / divisibility / size preconditions hold, route + // to the tile kernel. 
Otherwise fall through to the standard CUB + // dispatch below -- CUB's existing kernels handle the unaligned tail + // case via their own internal logic, so misalignment is a graceful + // fallback, not an error. if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v && detail::transform::tile::tile_dispatch_eligible_v< @@ -117,16 +117,13 @@ struct DeviceTransform RandomAccessIteratorOut, RandomAccessIteratorsIn...>) { - if (!detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast(num_items))) + if (detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast(num_items))) { - return cudaErrorInvalidValue; + return detail::transform::tile::dispatch( + inputs, output, static_cast(num_items), stream); } - return detail::transform::tile::dispatch( - inputs, output, static_cast(num_items), stream); } - else #endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION() - { using tuning_env = ::cuda::std::execution::__query_result_or_t>; @@ -151,7 +148,6 @@ struct DeviceTransform ::cuda::std::move(transform_op), stream, policy_selector{}); - } } // TODO(bgruber): we want to eventually forward the output tuple to the kernel and optimize writing multiple streams From b7e8c924aec76b8883feeff44eb84abc08e1a462 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 3 Jun 2026 17:44:11 -0700 Subject: [PATCH 17/83] migrate tile benches and tests to cub::DeviceTransform --- .../bench/transform/tile/babelstream.cu | 87 +++++++--- cub/benchmarks/bench/transform/tile/copy.cu | 30 +++- cub/benchmarks/bench/transform/tile/fill.cu | 14 +- .../bench/transform/tile/grayscale.cu | 40 +++-- .../bench/transform/tile/pytorch.cu | 149 +++++++++++++----- .../transform/tile/test_device_transform.cu | 76 +++++++-- .../dispatch/dispatch_transform_tile.cuh | 8 - 7 files changed, 301 insertions(+), 103 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu 
b/cub/benchmarks/bench/transform/tile/babelstream.cu index 1e180f850a4..2201b05674a 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -1,31 +1,74 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// BabelStream-style bandwidth benchmarks on cub_tile::DeviceTransform. -// Mirror of cub/benchmarks/bench/transform/babelstream.cu so we can compare -// numbers side-by-side. +// BabelStream-style bandwidth benchmarks via cub::DeviceTransform::Transform. +// Custom ops self-register their tile substitutes via tile_eligible<>, so the +// dispatch hook routes them to the tile kernel under --enable-tile + the +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro. #include -#include +#include #include #include #include #include +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +# include +#endif + #include "bench_init.cuh" -#ifndef TILE_SIZE -#define TILE_SIZE 0 // 0 = auto-pick via detail::pick_tile_size -#endif -#define STR_(x) #x -#define STR(x) STR_(x) +// User-defined scalar ops (used at the call site, in both build modes). 
+struct mul_op { + template + __host__ __device__ auto operator()(B b) const { return -(b + b); } +}; +struct add_op { + template + __host__ __device__ auto operator()(A a, B b) const { return a + b; } +}; +struct triad_op { + template + __host__ __device__ auto operator()(B b, C c) const { return b - c - c; } +}; +struct nstream_op { + template + __host__ __device__ auto operator()(A a, B b, C c) const { return a + b - c - c; } +}; -struct mul_op { template __tile__ auto operator()(B b) const { return -(b + b); } }; -struct add_op { template __tile__ auto operator()(A a, B b) const { return a + b; } }; -struct triad_op { template __tile__ auto operator()(B b, C c) const { return b - c - c; } }; -struct nstream_op { template __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; } }; +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +// Tile-friendly substitutes (must be stateless + trivially default constructible). +struct tile_mul_op { + template + __tile__ auto operator()(B b) const { return -(b + b); } +}; +struct tile_add_op { + template + __tile__ auto operator()(A a, B b) const { return a + b; } +}; +struct tile_triad_op { + template + __tile__ auto operator()(B b, C c) const { return b - c - c; } +}; +struct tile_nstream_op { + template + __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; } +}; + +// Self-register each scalar op for all T (partial specialization on T). 
+CUB_NAMESPACE_BEGIN +namespace detail::transform::tile +{ +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_mul_op; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_add_op; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_triad_op; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_nstream_op; }; +} // namespace detail::transform::tile +CUB_NAMESPACE_END +#endif // True if `bytes_needed` worth of GPU memory is available, with 5% headroom // for driver overhead. Caller should `state.skip(...)` on false. @@ -43,8 +86,6 @@ struct Buffers { cudaMalloc(&a, n * sizeof(T)); cudaMalloc(&b, n * sizeof(T)); cudaMalloc(&c, n * sizeof(T)); - // touch every page so HBM is actually backed (not cold-page tricks). - // values don't matter for BW measurement. bench_init::rand_fill(a, n, 0xA111); bench_init::rand_fill(b, n, 0xB222); bench_init::rand_fill(c, n, 0xC333); @@ -62,7 +103,7 @@ void mul(nvbench::state& state, nvbench::type_list) { state.add_global_memory_reads(n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(buf.b), buf.c, n, mul_op{}, launch.get_stream()); }); } @@ -75,7 +116,7 @@ void add(nvbench::state& state, nvbench::type_list) { state.add_global_memory_reads(2 * n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(buf.a, buf.b), buf.c, n, add_op{}, launch.get_stream()); }); } @@ -88,7 +129,7 @@ void triad(nvbench::state& state, nvbench::type_list) { state.add_global_memory_reads(2 * n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(buf.b, 
buf.c), buf.a, n, triad_op{}, launch.get_stream()); }); } @@ -101,7 +142,7 @@ void nstream(nvbench::state& state, nvbench::type_list) { state.add_global_memory_reads(3 * n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(buf.a, buf.b, buf.c), buf.a, n, nstream_op{}, launch.get_stream()); }); } @@ -109,9 +150,9 @@ void nstream(nvbench::state& state, nvbench::type_list) { using types = nvbench::type_list; inline auto sizes = std::vector{16, 20, 24, 28, 31}; -NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(types)).set_name("tile_mul_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); -NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(types)).set_name("tile_add_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); -NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(types)).set_name("tile_triad_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); -NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(types)).set_name("tile_mul").add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(types)).set_name("tile_add").add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(types)).set_name("tile_triad").add_int64_power_of_two_axis("Elements{io}", sizes); +NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream").add_int64_power_of_two_axis("Elements{io}", sizes); NVBENCH_MAIN diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index 951af8b0fed..07d08f74b8b 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -1,23 +1,43 @@ // SPDX-FileCopyrightText: Copyright (c) 
2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Pure copy bench (identity transform) — tile side. -// Isolates the load/store path from any arithmetic on top: useful for -// catching narrow-type store wars (e.g. byte stores capping BW). +// Pure copy bench (identity transform). Custom identity op self-registers +// its tile substitute via tile_eligible<>; under --enable-tile + the +// dispatch macro this routes to the tile load_masked/store_masked path, +// otherwise it falls through to CUB's standard transform. #include -#include + +#include + #include #include #include #include +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +# include +#endif + #include "bench_init.cuh" struct identity { + template __host__ __device__ auto operator()(T v) const { return v; } +}; + +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +struct tile_identity { template __tile__ auto operator()(T v) const { return v; } }; +CUB_NAMESPACE_BEGIN +namespace detail::transform::tile +{ +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity; }; +} // namespace detail::transform::tile +CUB_NAMESPACE_END +#endif + template void copy(nvbench::state& state, nvbench::type_list) { auto n = state.get_int64("Elements{io}"); @@ -28,7 +48,7 @@ void copy(nvbench::state& state, nvbench::type_list) { state.add_global_memory_reads(n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(in), out, n, identity{}, launch.get_stream()); }); cudaFree(in); cudaFree(out); diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu index 5514c1a1287..5105b25b67b 100644 --- a/cub/benchmarks/bench/transform/tile/fill.cu +++ b/cub/benchmarks/bench/transform/tile/fill.cu @@ -1,12 +1,16 @@ // SPDX-FileCopyrightText: 
Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Fill: zero-input broadcast. CUB models this as Transform with empty input tuple -// and a no-arg op. Tile can't express zero-input Transform directly, so we use the -// dedicated cub_tile::DeviceTransform::Fill API which writes a constant. +// Fill: zero-input broadcast. Calls cub::DeviceTransform::Fill, which goes +// through the unified __transform_internal path -- our trait dispatch hook +// sees the zero-input case but currently has no trait spec for it, so this +// lands on CUB's standard Fill kernel. Wire a tile substitute later if Fill +// becomes a bottleneck. #include -#include + +#include + #include template @@ -16,7 +20,7 @@ void fill(nvbench::state& state, nvbench::type_list) { state.add_element_count(n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Fill(out, n, T(42), launch.get_stream()); + cub::DeviceTransform::Fill(out, n, T(42), launch.get_stream()); }); cudaFree(out); } diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index 14641c2d872..e715945b9bb 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -1,21 +1,35 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Grayscale: RGB pixel -> luminance. Uses a 3-component pixel type. -// CUB stores rgb_t (12 bytes) packed; tile may or may not accept this as an -// element type. If tile rejects rgb_t, this bench will fail to compile — -// we'll then fall back to treating R/G/B as three separate float streams. +// Grayscale: RGB pixel -> luminance via three separate input streams. +// Custom rgb_to_y op self-registers its tile substitute via tile_eligible<>. 
#include -#include -#include "bench_init.cuh" + +#include + #include #include #include -// Three-stream version (R, G, B as separate input arrays). -// Computationally equivalent to CUB's packed rgb_t version. +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +# include +#endif + +#include "bench_init.cuh" + struct rgb_to_y { + template + __host__ __device__ auto operator()(R r, G g, B b) const { + constexpr float w_r = 0.2989f; + constexpr float w_g = 0.587f; + constexpr float w_b = 0.114f; + return w_r * r + w_g * g + w_b * b; + } +}; + +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +struct tile_rgb_to_y { template __tile__ auto operator()(R r, G g, B b) const { constexpr float w_r = 0.2989f; @@ -25,6 +39,14 @@ struct rgb_to_y { } }; +CUB_NAMESPACE_BEGIN +namespace detail::transform::tile +{ +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_rgb_to_y; }; +} // namespace detail::transform::tile +CUB_NAMESPACE_END +#endif + template void grayscale(nvbench::state& state, nvbench::type_list) { const auto n = state.get_int64("Elements{io}"); @@ -39,7 +61,7 @@ void grayscale(nvbench::state& state, nvbench::type_list) { state.add_global_memory_reads(3 * n); // matches CUB's rgb_t = 3*sizeof(T) state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(r, g, b), out, n, rgb_to_y{}, launch.get_stream()); }); cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out); diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index e1eee3e4452..71cbd20f583 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -1,51 +1,128 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: BSD-3-Clause -// PyTorch ops on tile. Uses ct::tanh / ct::sin / ct::exp / ct::select. +// PyTorch-style ops via cub::DeviceTransform::Transform. Each custom op +// self-registers a tile substitute through tile_eligible<>, so the dispatch +// hook routes them to the tile kernel under --enable-tile + the +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro. MUFU-heavy ops also opt into +// tile_mufu_heavy<> so the tile policy picker caps items/thread at the +// vector width on sub-4-byte types. #include -#include + +#include + #include #include #include +#include #include #include +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +# include +#endif + #include "bench_init.cuh" -namespace ct = cuda::tiles; +// ======================================================================== +// Scalar ops (the types the user passes to cub::DeviceTransform::Transform). +// Sub-4-byte input types compute in float and cast back, matching the tile +// substitute below. +// ======================================================================== +template __host__ __device__ float to_f(T v) { return static_cast(v); } +template __host__ __device__ T from_f(float f) { return static_cast(f); } + +struct relu_op { template __host__ __device__ T operator()(T v) const { + float f = to_f(v); return from_f(f > 0.0f ? 
f : 0.0f); } }; +struct sigmoid_op { template __host__ __device__ T operator()(T v) const { + float f = to_f(v); return from_f(1.0f / (1.0f + ::cuda::std::exp(-f))); } }; +struct tanh_op { template __host__ __device__ T operator()(T v) const { + return from_f(::cuda::std::tanh(to_f(v))); } }; +struct gelu_op { template __host__ __device__ T operator()(T v) const { + constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; + float f = to_f(v); + return from_f(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f)))); } }; +struct sin_op { template __host__ __device__ T operator()(T v) const { + return from_f(::cuda::std::sin(to_f(v))); } }; +struct exp_op { template __host__ __device__ T operator()(T v) const { + return from_f(::cuda::std::exp(to_f(v))); } }; + +struct binary_add { template __host__ __device__ auto operator()(A a, B b) const { return a + b; } }; +struct binary_sub { template __host__ __device__ auto operator()(A a, B b) const { return a - b; } }; +struct binary_mul { template __host__ __device__ auto operator()(A a, B b) const { return a * b; } }; +struct binary_div { template __host__ __device__ auto operator()(A a, B b) const { return a / b; } }; +struct binary_le { template __host__ __device__ A operator()(A a, B b) const { return static_cast(a <= b); } }; +struct binary_ge { template __host__ __device__ A operator()(A a, B b) const { return static_cast(a >= b); } }; +struct binary_fmin { template __host__ __device__ auto operator()(A a, B b) const { return a < b ? a : b; } }; +struct binary_fmax { template __host__ __device__ auto operator()(A a, B b) const { return a > b ? a : b; } }; + +// ======================================================================== +// Tile substitutes + trait registration. Only compiled under tile mode. 
+// ======================================================================== +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +namespace ct = ::cuda::tiles; -// --- Unary --- (compute in float, cast back so the same ops work for __half/__bf16/float) template __tile__ auto as_float(T v) { return ct::element_cast(v); } template __tile__ auto from_float(F f) { return ct::element_cast>(f); } -struct relu_op { template __tile__ auto operator()(T v) const { +struct tile_relu { template __tile__ auto operator()(T v) const { auto f = as_float(v); return from_float(ct::select(f > 0.0f, f, f - f)); } }; -struct sigmoid_op { template __tile__ auto operator()(T v) const { +struct tile_sigmoid { template __tile__ auto operator()(T v) const { auto f = as_float(v); return from_float(1.0f / (1.0f + ct::exp(-f))); } }; -struct tanh_op { template __tile__ auto operator()(T v) const { +struct tile_tanh { template __tile__ auto operator()(T v) const { return from_float(ct::tanh(as_float(v))); } }; -struct gelu_op { template __tile__ auto operator()(T v) const { +struct tile_gelu { template __tile__ auto operator()(T v) const { constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; auto f = as_float(v); return from_float(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); } }; -struct sin_op { template __tile__ auto operator()(T v) const { return from_float(ct::sin(as_float(v))); } }; -struct exp_op { template __tile__ auto operator()(T v) const { return from_float(ct::exp(as_float(v))); } }; - -// --- Binary --- -struct binary_add { template __tile__ auto operator()(A a, B b) const { return a + b; } }; -struct binary_sub { template __tile__ auto operator()(A a, B b) const { return a - b; } }; -struct binary_mul { template __tile__ auto operator()(A a, B b) const { return a * b; } }; -struct binary_div { template __tile__ auto operator()(A a, B b) const { return a / b; } }; -// le/ge: cast the bool result tile to A's element type so it fits the 
float output buffer -// (CUB does the same implicit cast via its iterator path). -struct binary_le { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a <= b); } }; -struct binary_ge { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a >= b); } }; -struct binary_fmin { template __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } }; -struct binary_fmax { template __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } }; - - -template +struct tile_sin { template __tile__ auto operator()(T v) const { return from_float(ct::sin(as_float(v))); } }; +struct tile_exp { template __tile__ auto operator()(T v) const { return from_float(ct::exp(as_float(v))); } }; + +struct tile_binary_add { template __tile__ auto operator()(A a, B b) const { return a + b; } }; +struct tile_binary_sub { template __tile__ auto operator()(A a, B b) const { return a - b; } }; +struct tile_binary_mul { template __tile__ auto operator()(A a, B b) const { return a * b; } }; +struct tile_binary_div { template __tile__ auto operator()(A a, B b) const { return a / b; } }; +struct tile_binary_le { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a <= b); } }; +struct tile_binary_ge { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a >= b); } }; +struct tile_binary_fmin { template __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } }; +struct tile_binary_fmax { template __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } }; + +CUB_NAMESPACE_BEGIN +namespace detail::transform::tile +{ +// Unary +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_relu; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_sigmoid; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_tanh; }; +template struct 
tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_gelu; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_sin; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_exp; }; + +// MUFU-heavy unary ops: hint to tile policy picker to cap items/thread at vector width on sub-4-byte types. +template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; +template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; +template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; +template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; +template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; + +// Binary +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_add; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_sub; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_mul; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_div; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_le; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_ge; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_fmin; }; +template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_fmax; }; +} // namespace detail::transform::tile +CUB_NAMESPACE_END +#endif + +// ======================================================================== +// Bench harness. 
+// ======================================================================== +template void run_unary(nvbench::state& state) { const auto n = state.get_int64("Elements{io}"); T *in, *out; @@ -55,7 +132,7 @@ void run_unary(nvbench::state& state) { state.add_global_memory_reads(n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform<0, MufuHeavy>( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(in), out, n, Op{}, launch.get_stream()); }); cudaFree(in); cudaFree(out); @@ -73,7 +150,7 @@ void run_binary(nvbench::state& state) { state.add_global_memory_reads(2*n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { - cub_tile::DeviceTransform::Transform( + cub::DeviceTransform::Transform( ::cuda::std::make_tuple(a, b), out, n, Op{}, launch.get_stream()); }); cudaFree(a); cudaFree(b); cudaFree(out); @@ -82,18 +159,16 @@ void run_binary(nvbench::state& state) { using element_types = nvbench::type_list<__half, __nv_bfloat16, float>; inline auto pt_sizes = std::vector{16, 20, 24, 28, 31}; -#define UNARY_BENCH(name, op, mufu) \ - template void name##_bench(nvbench::state& state, nvbench::type_list) { run_unary(state); } \ +#define UNARY_BENCH(name, op) \ + template void name##_bench(nvbench::state& state, nvbench::type_list) { run_unary(state); } \ NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes); -// MufuHeavy hint set for ops dominated by MUFU intrinsics (exp/tanh/sin/cos). -// relu is just compare+select, so no hint. 
-UNARY_BENCH(relu, relu_op, false) -UNARY_BENCH(sigmoid, sigmoid_op, true) -UNARY_BENCH(tanh, tanh_op, true) -UNARY_BENCH(gelu, gelu_op, true) -UNARY_BENCH(sin, sin_op, true) -UNARY_BENCH(exp, exp_op, true) +UNARY_BENCH(relu, relu_op) +UNARY_BENCH(sigmoid, sigmoid_op) +UNARY_BENCH(tanh, tanh_op) +UNARY_BENCH(gelu, gelu_op) +UNARY_BENCH(sin, sin_op) +UNARY_BENCH(exp, exp_op) #define BINARY_BENCH(name, op) \ template void name##_bench(nvbench::state& state, nvbench::type_list) { run_binary(state); } \ diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu index 0df21dc66a3..713a3846025 100644 --- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu +++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu @@ -1,13 +1,22 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Standalone correctness tests for cub_tile::DeviceTransform. -// Sits next to the benches so it builds against the same tileiras -// toolchain and does not pretend to be part of CCCL's catch2 suite. - -#include +// Standalone correctness tests for cub::DeviceTransform with the tile +// dispatch hook on. Exercises: +// - Built-in trait specs (cuda::std::plus, cuda::std::multiplies) +// - User-registered trait specs (square_op, identity_op) +// - cub::DeviceTransform::Fill (zero-input case) +// +// Built under --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH so the +// hook routes eligible combos to the tile kernel. Sits next to the benches +// so it builds against the same tileiras toolchain; not part of CCCL's +// catch2 suite. 
+ +#include #include + +#include #include #include @@ -16,6 +25,10 @@ #include #include +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +# include +#endif + namespace { int g_failures = 0; @@ -59,10 +72,28 @@ void expect_array(const char* name, const std::vector& got, const std::vector else { std::printf("[ OK ] %s (n=%zu)\n", name, got.size()); } } -struct identity_op { template __tile__ auto operator()(A a) const { return a; } }; -struct square_op { template __tile__ auto operator()(A a) const { return a * a; } }; -struct add_op { template __tile__ auto operator()(A a, B b) const { return a + b; } }; -struct mul_op { template __tile__ auto operator()(A a, B b) const { return a * b; } }; +// User-defined scalar functors (the call-site type). identity_op and square_op +// don't have a cuda::std equivalent, so we self-register them. add and mul map +// to cuda::std::plus / cuda::std::multiplies which CCCL already ships specs for. + +struct identity_op { + template __host__ __device__ T operator()(T a) const { return a; } +}; +struct square_op { + template __host__ __device__ T operator()(T a) const { return a * a; } +}; + +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +namespace ct = ::cuda::tiles; + +// Tile-friendly substitutes (must be stateless + trivially default constructible). 
+struct tile_identity_op { + template __tile__ auto operator()(T v) const { return v; } +}; +struct tile_square_op { + template __tile__ auto operator()(T v) const { return v * v; } +}; +#endif template std::vector ramp(int64_t n, T start = T{0}, T step = T{1}) { @@ -91,7 +122,7 @@ template void test_identity(int64_t n) { auto h_in = ramp(n, T{1}, T{1}); GpuVec dx(h_in), dy(n); - CUDA_CHECK(cub_tile::DeviceTransform::Transform( + CUDA_CHECK(cub::DeviceTransform::Transform( ::cuda::std::make_tuple(dx.d), dy.d, n, identity_op{})); CUDA_CHECK(cudaDeviceSynchronize()); expect_array("identity", dy.to_host(), h_in); @@ -103,7 +134,7 @@ void test_square(int64_t n) { std::vector want(n); for (int64_t i = 0; i < n; ++i) want[i] = h_in[i] * h_in[i]; GpuVec dx(h_in), dy(n); - CUDA_CHECK(cub_tile::DeviceTransform::Transform( + CUDA_CHECK(cub::DeviceTransform::Transform( ::cuda::std::make_tuple(dx.d), dy.d, n, square_op{})); CUDA_CHECK(cudaDeviceSynchronize()); expect_array("square", dy.to_host(), want); @@ -116,8 +147,8 @@ void test_add(int64_t n) { std::vector want(n); for (int64_t i = 0; i < n; ++i) want[i] = ha[i] + hb[i]; GpuVec da(ha), db(hb), dc(n); - CUDA_CHECK(cub_tile::DeviceTransform::Transform( - ::cuda::std::make_tuple(da.d, db.d), dc.d, n, add_op{})); + CUDA_CHECK(cub::DeviceTransform::Transform( + ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::plus{})); CUDA_CHECK(cudaDeviceSynchronize()); expect_array("add", dc.to_host(), want); } @@ -129,8 +160,8 @@ void test_mul(int64_t n) { std::vector want(n); for (int64_t i = 0; i < n; ++i) want[i] = ha[i] * hb[i]; GpuVec da(ha), db(hb), dc(n); - CUDA_CHECK(cub_tile::DeviceTransform::Transform( - ::cuda::std::make_tuple(da.d, db.d), dc.d, n, mul_op{})); + CUDA_CHECK(cub::DeviceTransform::Transform( + ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::multiplies{})); CUDA_CHECK(cudaDeviceSynchronize()); expect_array("mul", dc.to_host(), want); } @@ -138,7 +169,7 @@ void test_mul(int64_t n) { template void 
test_fill(int64_t n, T value) { GpuVec dy(n); - CUDA_CHECK(cub_tile::DeviceTransform::Fill(dy.d, n, value)); + CUDA_CHECK(cub::DeviceTransform::Fill(dy.d, n, value)); CUDA_CHECK(cudaDeviceSynchronize()); std::vector want(n, value); expect_array("fill", dy.to_host(), want); @@ -146,6 +177,19 @@ void test_fill(int64_t n, T value) { } // namespace +// User self-registers identity_op and square_op as tile-eligible. +#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +CUB_NAMESPACE_BEGIN +namespace detail::transform::tile +{ +template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity_op; }; +template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity_op; }; +template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_square_op; }; +template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_square_op; }; +} // namespace detail::transform::tile +CUB_NAMESPACE_END +#endif + int main() { // pow-2, multiple tiles test_identity(4096); diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 414f64a3075..b62cfd61ca8 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -171,12 +171,4 @@ CUB_RUNTIME_FUNCTION cudaError_t dispatch( CUB_NAMESPACE_END -// Compatibility shim. Existing benches and tests still call -// cub_tile::DeviceTransform; once they move to cub::DeviceTransform with named -// functors and the trait dispatch, this alias can be removed. 
-namespace cub_tile -{ -using DeviceTransform = ::cub::detail::transform::tile::DeviceTransform; -} // namespace cub_tile - #endif // _CCCL_CTK_AT_LEAST(13, 3) From 4afa1b3070a3cd3020db6780937d513c6a229fba Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 4 Jun 2026 19:40:08 -0700 Subject: [PATCH 18/83] use int64 extents in tile kernels and clean up runtime precondition check --- .../dispatch/dispatch_transform_tile.cuh | 19 +++++++++++-------- .../kernels/kernel_transform_tile.cuh | 9 ++++++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index b62cfd61ca8..438a237c35e 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -27,7 +27,7 @@ # include # include -# include +# include # include # include # include @@ -120,20 +120,23 @@ template CUB_RUNTIME_FUNCTION bool runtime_preconditions_ok(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) { - constexpr int kAlign = 16; - // Tile DSL's tensor_span uses uint32_t shape internally; values >= 2^32 - // wrap to 0. Cap at 2^31 to stay below the cliff with margin. - constexpr OffsetT kMaxItems = OffsetT{1} << 31; + // Pointer alignment is in bytes (for LDG.E.128); the kernel's + // ct::assume_divisible applies to num_items as an element count. These + // are both 16 today by coincidence but live on different axes. + constexpr int byte_align = 16; + constexpr int items_divisor = 16; auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); - const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign); + const bool aligned_out = ::cuda::std::is_sufficiently_aligned(out_ptr); const bool aligned_in = ::cuda::std::apply( [](auto... 
iters) { - return ((::cuda::is_aligned(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters), kAlign)) && ...); + return ((::cuda::std::is_sufficiently_aligned( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters))) + && ...); }, inputs); - return aligned_out && aligned_in && (num_items % kAlign) == 0 && num_items <= kMaxItems; + return aligned_out && aligned_in && (num_items % items_divisor) == 0; } // Bridge between cub::DeviceTransform::__transform_internal and the tile diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 3d038c9068f..5a67e75a04c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -41,12 +41,14 @@ transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restri auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); auto out = ct::assume_aligned<16>(out_); - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + // Explicit int64_t element type on the extent; CTAD would deduce uint32_t + // and wrap at 2^32. Using int64_t lets us drop the 2^31 runtime cap. 
+ auto out_span = ct::tensor_span{out, ct::extents{num_items}}; auto out_view = ct::partition_view{out_span, ct::shape{}}; auto load_one = [bx, num_items](auto* ptr_) { auto ptr = ct::assume_aligned<16>(ptr_); - auto span = ct::tensor_span{ptr, ct::extents{num_items}}; + auto span = ct::tensor_span{ptr, ct::extents{num_items}}; auto view = ct::partition_view{span, ct::shape{}}; return view.load_masked(bx); }; @@ -63,7 +65,8 @@ __tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T val auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); auto out = ct::assume_aligned<16>(out_); - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + // Explicit int64_t element type on the extent (see transform_kernel above). + auto out_span = ct::tensor_span{out, ct::extents{num_items}}; auto out_view = ct::partition_view{out_span, ct::shape{}}; using tile_t = ct::tile>; out_view.store_masked(ct::full(value), bx); From f761178b3ce8bf9945835dc6a1167cd465779d56 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 12:49:39 -0700 Subject: [PATCH 19/83] lift tile_eligible and tile_mufu_heavy to cub::transform namespace --- .../bench/transform/tile/babelstream.cu | 4 +- cub/benchmarks/bench/transform/tile/copy.cu | 4 +- .../bench/transform/tile/grayscale.cu | 4 +- .../bench/transform/tile/pytorch.cu | 4 +- .../transform/tile/test_device_transform.cu | 4 +- .../dispatch/dispatch_transform_tile.cuh | 8 +- .../dispatch_transform_tile_traits.cuh | 91 +++++++++++-------- 7 files changed, 70 insertions(+), 49 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index 2201b05674a..ba1c37036b0 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -60,13 +60,13 @@ struct tile_nstream_op { // Self-register each scalar op for all T (partial specialization on T). 
CUB_NAMESPACE_BEGIN -namespace detail::transform::tile +namespace transform { template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_mul_op; }; template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_add_op; }; template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_triad_op; }; template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_nstream_op; }; -} // namespace detail::transform::tile +} // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index 07d08f74b8b..fd697256dd9 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -31,10 +31,10 @@ struct tile_identity { }; CUB_NAMESPACE_BEGIN -namespace detail::transform::tile +namespace transform { template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity; }; -} // namespace detail::transform::tile +} // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index e715945b9bb..80768581aab 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -40,10 +40,10 @@ struct tile_rgb_to_y { }; CUB_NAMESPACE_BEGIN -namespace detail::transform::tile +namespace transform { template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_rgb_to_y; }; -} // namespace detail::transform::tile +} // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 71cbd20f583..6e35560f426 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -89,7 +89,7 @@ struct tile_binary_fmin { template __tile__ auto 
operator()(A struct tile_binary_fmax { template __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } }; CUB_NAMESPACE_BEGIN -namespace detail::transform::tile +namespace transform { // Unary template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_relu; }; @@ -115,7 +115,7 @@ template struct tile_eligible : ::cuda::std::true_t template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_ge; }; template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_fmin; }; template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_fmax; }; -} // namespace detail::transform::tile +} // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu index 713a3846025..b3fe263909d 100644 --- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu +++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu @@ -180,13 +180,13 @@ void test_fill(int64_t n, T value) { // User self-registers identity_op and square_op as tile-eligible. 
#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() CUB_NAMESPACE_BEGIN -namespace detail::transform::tile +namespace transform { template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity_op; }; template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity_op; }; template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_square_op; }; template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_square_op; }; -} // namespace detail::transform::tile +} // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 438a237c35e..ac9cdaf059d 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -109,7 +109,8 @@ template inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) - && tile_eligible_v, sizeof...(InIters)>; + && CUB_NS_QUALIFIER::transform::tile_eligible_v< + Op, __detail::__unwrapped_value_t, sizeof...(InIters)>; // Runtime predicate consulted by the cub::DeviceTransform tile hook before // it commits to the tile path. 
Mirrors how CUB's dispatch_t::CanVectorize @@ -160,13 +161,14 @@ CUB_RUNTIME_FUNCTION cudaError_t dispatch( }, inputs); using out_value_t = ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t>; - using tile_op_t = typename tile_eligible::tile_op_type; + using tile_op_t = + typename CUB_NS_QUALIFIER::transform::tile_eligible::tile_op_type; static_assert(::cuda::std::is_empty_v, "tile_op_type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, "tile_op_type must be trivially default constructible"); - return DeviceTransform::template Transform<0, tile_mufu_heavy_v, tile_op_t>( + return DeviceTransform::template Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v, tile_op_t>( in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); } diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index c823cc46b99..330f34d8754 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -3,20 +3,27 @@ // Compile-time policy for cub::DeviceTransform's tile path. // -// Users call cub::DeviceTransform::Transform with whatever scalar functor they -// have (e.g. cuda::std::plus<__half>). That functor is NOT directly callable -// from a tile transform_kernel -- its operator() takes scalars, not ct::tile. -// So eligible specializations declare a `tile_op_type` member that names a -// tile-friendly replacement functor (with __tile__ templated operator()) that -// performs the same operation. The dispatch hook then launches the tile -// kernel with the replacement, not the user's original. +// PUBLIC EXTENSION POINTS (cub::transform): +// tile_eligible -- specialize this to opt a (functor type, +// element type, input arity) combo into +// the tile dispatch path. 
+// tile_eligible_v<...> -- variable-template companion. +// tile_mufu_heavy -- specialize to flag Op as MUFU-heavy; the +// tile policy picker uses this hint. +// tile_mufu_heavy_v<...> -- variable-template companion. // -// tile_eligible_v answers "should DeviceTransform::Transform -// route to the tile kernel for this (functor, element type, input arity)?". -// tile_mufu_heavy_v hints the tile policy picker that Op spends most of -// its time on MUFU instructions, so the picker caps items/thread at the -// vector width to avoid piling up MUFU work that cannot SIMD on Blackwell -// for sub-4-byte types. +// Users call cub::DeviceTransform::Transform with whatever scalar functor +// they have (e.g. cuda::std::plus<__half>). That scalar functor is NOT +// directly callable from a tile transform_kernel -- its operator() takes +// scalars, not ct::tile. So eligible specializations declare a `tile_op_type` +// member naming a tile-friendly replacement (a stateless functor with a +// __tile__ templated operator() that performs the same op on ct::tile args). +// The dispatch hook launches the tile kernel with the replacement, not the +// user's original functor instance. +// +// INTERNAL (cub::detail::transform::tile): +// tile_plus, tile_multiplies -- shipped tile-friendly substitutes used by +// the built-in specializations below. #pragma once @@ -44,6 +51,27 @@ CUB_NAMESPACE_BEGIN +// Public extension surface. +namespace transform +{ + +template +struct tile_eligible : ::cuda::std::false_type +{}; + +template +inline constexpr bool tile_eligible_v = tile_eligible::value; + +template +struct tile_mufu_heavy : ::cuda::std::false_type +{}; + +template +inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; + +} // namespace transform + +// Internal substitutes shipped by CCCL. 
namespace detail::transform::tile { @@ -70,50 +98,41 @@ struct tile_multiplies }; # endif // _CCCL_TILE_COMPILATION() -template -struct tile_eligible : ::cuda::std::false_type -{}; - -template -inline constexpr bool tile_eligible_v = tile_eligible::value; - -template -struct tile_mufu_heavy : ::cuda::std::false_type -{}; - -template -inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; +} // namespace detail::transform::tile +// Built-in trait specializations live in the public namespace alongside the +// trait, but reference the internal substitute functors. # if _CCCL_TILE_COMPILATION() +namespace transform +{ # if _CCCL_HAS_NVFP16() template <> -struct tile_eligible<::cuda::std::plus<__half>, __half, 2> : ::cuda::std::true_type +struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type { - using tile_op_type = tile_plus; + using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus; }; template <> -struct tile_eligible<::cuda::std::multiplies<__half>, __half, 2> : ::cuda::std::true_type +struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type { - using tile_op_type = tile_multiplies; + using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies; }; # endif // _CCCL_HAS_NVFP16() # if _CCCL_HAS_NVBF16() template <> -struct tile_eligible<::cuda::std::plus<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type +struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type { - using tile_op_type = tile_plus; + using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus; }; template <> -struct tile_eligible<::cuda::std::multiplies<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type +struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type { - using tile_op_type = tile_multiplies; + using tile_op_type = 
CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies; }; # endif // _CCCL_HAS_NVBF16() +} // namespace transform # endif // _CCCL_TILE_COMPILATION() -} // namespace detail::transform::tile - CUB_NAMESPACE_END #endif // _CCCL_CTK_AT_LEAST(13, 3) From eb6bd04bbcd6d172b1b9381f7ade2755d5574995 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 12:58:25 -0700 Subject: [PATCH 20/83] purge outdated comments from before runtime fallback was added --- cub/cub/device/device_transform.cuh | 10 ++++---- .../dispatch/dispatch_transform_tile.cuh | 23 ++++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 1560d9e1a68..bcb84f76fa4 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -105,11 +105,11 @@ struct DeviceTransform #if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND - // the runtime alignment / divisibility / size preconditions hold, route - // to the tile kernel. Otherwise fall through to the standard CUB - // dispatch below -- CUB's existing kernels handle the unaligned tail - // case via their own internal logic, so misalignment is a graceful - // fallback, not an error. + // the runtime alignment + divisibility preconditions hold, route to the + // tile kernel. Otherwise fall through to the standard CUB dispatch + // below -- CUB's existing kernels handle the unaligned tail case via + // their own internal logic, so misalignment is a graceful fallback, + // not an error. 
if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v && detail::transform::tile::tile_dispatch_eligible_v< diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index ac9cdaf059d..e3bb569b2af 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -1,10 +1,16 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// Tile port of cub::DeviceTransform. The public surface mirrors -// cub::DeviceTransform::{Transform, Fill}; the kernels are written against the -// tile DSL (cuda::tiles). This header requires CTK 13.3 or newer and nvcc -// invoked with --enable-tile. +// Internal dispatch helpers for cub::DeviceTransform's tile path: +// tile_dispatch_eligible_v -- compile-time predicate the hook consults +// runtime_preconditions_ok -- runtime alignment + divisibility predicate +// dispatch -- bridge that launches the tile kernel with +// the trait's substitute functor +// DeviceTransform -- internal tile-local Transform/Fill wrappers +// used by `dispatch` +// User-facing extension points (tile_eligible / tile_mufu_heavy) live in +// dispatch_transform_tile_traits.cuh under cub::transform. +// Requires CTK 13.3 or newer and nvcc invoked with --enable-tile. #pragma once @@ -102,9 +108,10 @@ using __unwrapped_value_t = // Combined compile-time predicate used by cub::DeviceTransform's __transform_internal // to decide whether to route a given (Op, OutIter, InIters...) to the tile path. -// The call site lifts this into an `if constexpr` so the standard CUB dispatch -// is not instantiated when tile takes over (under --enable-tile the standard -// path fails to compile for many functor/type combinations). 
+// The call site lifts this into an `if constexpr`: when this is true the hook +// tries the tile kernel first and, on runtime alignment / divisibility +// failure, falls through to the standard CUB dispatch below. When false, the +// tile branch is discarded and only CUB's standard path is emitted. template inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v @@ -116,7 +123,7 @@ inline constexpr bool tile_dispatch_eligible_v = // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize // guards the vectorized kernel. The tile kernels use ct::assume_aligned<16> // and ct::assume_divisible<16>, so violating these at runtime is UB. -// Returns false to tell the hook to surface cudaErrorInvalidValue. +// Returns false to tell the hook to fall back to the standard CUB dispatch. template CUB_RUNTIME_FUNCTION bool runtime_preconditions_ok(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) From f69e0ede371ff775c245c3f22ef5192e13cf6fc7 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 13:12:48 -0700 Subject: [PATCH 21/83] move kernel doc-comment next to the kernel and reflow to 100 col --- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 5a67e75a04c..5f6271e61e1 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -1,11 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels -// assume 16-byte alignment on every pointer and 16-byte divisibility on -// num_items so the compiler can pick LDG.E.128. 
Callers in the dispatch -// header are responsible for honoring those preconditions. - #pragma once #include @@ -29,6 +24,9 @@ CUB_NAMESPACE_BEGIN namespace detail::transform::tile { +// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on +// every pointer and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in +// the dispatch header are responsible for honoring those preconditions. template __tile_global__ void transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_) From 207ba0e08a454652d6e72fa74ab5187a6450d04e Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 13:24:20 -0700 Subject: [PATCH 22/83] gate tile transform headers on a single config macro --- .../bench/transform/tile/babelstream.cu | 4 +-- cub/benchmarks/bench/transform/tile/copy.cu | 4 +-- .../bench/transform/tile/grayscale.cu | 4 +-- .../bench/transform/tile/pytorch.cu | 4 +-- .../transform/tile/test_device_transform.cu | 6 ++-- cub/cub/device/device_transform.cuh | 7 ++-- .../dispatch/dispatch_transform_tile.cuh | 6 ++-- .../dispatch_transform_tile_config.cuh | 34 +++++++++++++++++++ .../dispatch_transform_tile_traits.cuh | 24 ++++++------- .../kernels/kernel_transform_tile.cuh | 6 ++-- .../dispatch/tuning/tuning_transform_tile.cuh | 6 ++-- 11 files changed, 71 insertions(+), 34 deletions(-) create mode 100644 cub/cub/device/dispatch/dispatch_transform_tile_config.cuh diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index ba1c37036b0..297ef78379a 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -15,7 +15,7 @@ #include #include -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif @@ -39,7 +39,7 @@ struct nstream_op { __host__ __device__ auto 
operator()(A a, B b, C c) const { return a + b - c - c; } }; -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() // Tile-friendly substitutes (must be stateless + trivially default constructible). struct tile_mul_op { template diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index fd697256dd9..da9665b2f25 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -15,7 +15,7 @@ #include #include -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif @@ -25,7 +25,7 @@ struct identity { template __host__ __device__ auto operator()(T v) const { return v; } }; -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() struct tile_identity { template __tile__ auto operator()(T v) const { return v; } }; diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index 80768581aab..9f364304266 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -12,7 +12,7 @@ #include #include -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif @@ -28,7 +28,7 @@ struct rgb_to_y { } }; -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() struct tile_rgb_to_y { template __tile__ auto operator()(R r, G g, B b) const { diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 6e35560f426..0e1767fdac7 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -19,7 
+19,7 @@ #include #include -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif @@ -60,7 +60,7 @@ struct binary_fmax { template __host__ __device__ auto operat // ======================================================================== // Tile substitutes + trait registration. Only compiled under tile mode. // ======================================================================== -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() namespace ct = ::cuda::tiles; template __tile__ auto as_float(T v) { return ct::element_cast(v); } diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu index b3fe263909d..d3a143a3deb 100644 --- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu +++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu @@ -25,7 +25,7 @@ #include #include -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif @@ -83,7 +83,7 @@ struct square_op { template __host__ __device__ T operator()(T a) const { return a * a; } }; -#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() namespace ct = ::cuda::tiles; // Tile-friendly substitutes (must be stateless + trivially default constructible). @@ -178,7 +178,7 @@ void test_fill(int64_t n, T value) { } // namespace // User self-registers identity_op and square_op as tile-eligible. 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() CUB_NAMESPACE_BEGIN namespace transform { diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index bcb84f76fa4..bcdedf8ba95 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -15,9 +15,10 @@ #include #include +#include #include -#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif @@ -103,7 +104,7 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); -#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION() +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND // the runtime alignment + divisibility preconditions hold, route to the // tile kernel. 
Otherwise fall through to the standard CUB dispatch @@ -123,7 +124,7 @@ struct DeviceTransform inputs, output, static_cast(num_items), stream); } } -#endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION() +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() using tuning_env = ::cuda::std::execution::__query_result_or_t>; diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index e3bb569b2af..f9ec1e1ff31 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -16,6 +16,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -24,7 +26,7 @@ # pragma system_header #endif // no system header -#if _CCCL_CTK_AT_LEAST(13, 3) +#if _CCCL_CUB_HAS_TILE_TRANSFORM() # include # include @@ -183,4 +185,4 @@ CUB_RUNTIME_FUNCTION cudaError_t dispatch( CUB_NAMESPACE_END -#endif // _CCCL_CTK_AT_LEAST(13, 3) +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh new file mode 100644 index 00000000000..cd43a9d8b48 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Single source of truth for the compile-time gates the tile transform headers +// share. Two macros: +// +// _CCCL_CUB_HAS_TILE_TRANSFORM() +// True when CUB's tile transform machinery is available: CTK 13.3 or newer, +// C++20 (tile DSL requires it), and the tile compilation trajectory +// (--enable-tile). When false, the tile headers (kernel / tuning / dispatch +// / traits) are skipped entirely. 
+// +// _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +// True when the dispatch hook in cub::DeviceTransform should fire. Same as +// _CCCL_CUB_HAS_TILE_TRANSFORM() plus the user opt-in macro +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH. + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#define _CCCL_CUB_HAS_TILE_TRANSFORM() \ + (_CCCL_CTK_AT_LEAST(13, 3) && _CCCL_STD_VER >= 2020 && _CCCL_TILE_COMPILATION()) + +#define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() \ + (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index 330f34d8754..32422766ba4 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -29,6 +29,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -37,7 +39,9 @@ # pragma system_header #endif // no system header -#if _CCCL_CTK_AT_LEAST(13, 3) +#if _CCCL_CUB_HAS_TILE_TRANSFORM() + +# include # include # include @@ -45,10 +49,6 @@ # include -# if _CCCL_TILE_COMPILATION() -# include -# endif - CUB_NAMESPACE_BEGIN // Public extension surface. @@ -75,7 +75,6 @@ inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; namespace detail::transform::tile { -# if _CCCL_TILE_COMPILATION() // Tile-friendly mirrors of common cuda::std ops. Each has a __tile__ // templated operator() so it can be invoked from inside transform_kernel // where the arguments are ct::tile rather than scalar T. 
@@ -96,16 +95,14 @@ struct tile_multiplies return a * b; } }; -# endif // _CCCL_TILE_COMPILATION() } // namespace detail::transform::tile // Built-in trait specializations live in the public namespace alongside the // trait, but reference the internal substitute functors. -# if _CCCL_TILE_COMPILATION() namespace transform { -# if _CCCL_HAS_NVFP16() +# if _CCCL_HAS_NVFP16() template <> struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type { @@ -116,9 +113,9 @@ struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::s { using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies; }; -# endif // _CCCL_HAS_NVFP16() +# endif // _CCCL_HAS_NVFP16() -# if _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVBF16() template <> struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type { @@ -129,10 +126,9 @@ struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, { using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies; }; -# endif // _CCCL_HAS_NVBF16() +# endif // _CCCL_HAS_NVBF16() } // namespace transform -# endif // _CCCL_TILE_COMPILATION() CUB_NAMESPACE_END -#endif // _CCCL_CTK_AT_LEAST(13, 3) +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 5f6271e61e1..a5c7e2d2d82 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -5,6 +5,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -13,7 +15,7 @@ # pragma system_header #endif // no system header -#if _CCCL_CTK_AT_LEAST(13, 3) +#if _CCCL_CUB_HAS_TILE_TRANSFORM() # include @@ -74,4 +76,4 @@ __tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T 
val CUB_NAMESPACE_END -#endif // _CCCL_CTK_AT_LEAST(13, 3) +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index 86c2d1b394f..4bd82475c5c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -11,6 +11,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -19,7 +21,7 @@ # pragma system_header #endif // no system header -#if _CCCL_CTK_AT_LEAST(13, 3) +#if _CCCL_CUB_HAS_TILE_TRANSFORM() # include @@ -100,4 +102,4 @@ constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) CUB_NAMESPACE_END -#endif // _CCCL_CTK_AT_LEAST(13, 3) +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() From 48b949a189be94b59fe44efc283fd3fb9f500812 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 13:28:32 -0700 Subject: [PATCH 23/83] tidy kernel_transform_tile.cuh: use cuda::std::int64_t and drop _-suffix params --- .../kernels/kernel_transform_tile.cuh | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index a5c7e2d2d82..18bf9cd5f86 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -19,7 +19,7 @@ # include -# include +# include CUB_NAMESPACE_BEGIN @@ -31,42 +31,42 @@ namespace detail::transform::tile // the dispatch header are responsible for honoring those preconditions. template __tile_global__ void -transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_) +transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) { - namespace ct = cuda::tiles; + namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; Fn fn{}; - auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); - auto out = ct::assume_aligned<16>(out_); + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const auto out_align = ct::assume_aligned<16>(out); - // Explicit int64_t element type on the extent; CTAD would deduce uint32_t - // and wrap at 2^32. Using int64_t lets us drop the 2^31 runtime cap. - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + // Explicit int64_t element type on the extent; CTAD would deduce uint32_t and wrap at 2^32. Using + // int64_t lets us drop the 2^31 runtime cap. + auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; auto out_view = ct::partition_view{out_span, ct::shape{}}; - auto load_one = [bx, num_items](auto* ptr_) { - auto ptr = ct::assume_aligned<16>(ptr_); - auto span = ct::tensor_span{ptr, ct::extents{num_items}}; - auto view = ct::partition_view{span, ct::shape{}}; + auto load_one = [bx, n](auto* ptr) { + auto ptr_align = ct::assume_aligned<16>(ptr); + auto span = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; + auto view = ct::partition_view{span, ct::shape{}}; return view.load_masked(bx); }; - out_view.store_masked(fn(load_one(ins_)...), bx); + out_view.store_masked(fn(load_one(ins)...), bx); } template -__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) +__tile_global__ void fill_kernel(::cuda::std::int64_t num_items, T* __restrict__ out, T value) { - namespace ct = cuda::tiles; + namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; - auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_)); - auto out = ct::assume_aligned<16>(out_); + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const auto out_align = 
ct::assume_aligned<16>(out); // Explicit int64_t element type on the extent (see transform_kernel above). - auto out_span = ct::tensor_span{out, ct::extents{num_items}}; + auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; auto out_view = ct::partition_view{out_span, ct::shape{}}; using tile_t = ct::tile>; out_view.store_masked(ct::full(value), bx); From bb091d0d33d938999141dc35371dcb0e753168e0 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 14:02:45 -0700 Subject: [PATCH 24/83] factor out make_partition_view helper and document assume_* annotations --- .../kernels/kernel_transform_tile.cuh | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 18bf9cd5f86..781bd84b6b5 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -26,32 +26,39 @@ CUB_NAMESPACE_BEGIN namespace detail::transform::tile { +// Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing: +// assume_aligned<16> -- promises the pointer is 16-byte aligned, so the compiler can pick +// LDG.E.128 vectorized loads/stores. +// ct::extents -- explicit element type on the extent; CTAD would deduce uint32_t and +// wrap at 2^32. int64_t lets us cover the full num_items range. +// The caller is responsible for honoring assume_aligned<16>; the dispatch header's +// runtime_preconditions_ok enforces this before launching either kernel. 
+template +__tile__ auto make_partition_view(T* ptr, N n) +{ + namespace ct = ::cuda::tiles; + const auto ptr_align = ct::assume_aligned<16>(ptr); + auto span = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; + return ct::partition_view{span, ct::shape{}}; +} + // Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on // every pointer and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in // the dispatch header are responsible for honoring those preconditions. +// +// assume_divisible<16> -- promises num_items % 16 == 0, so the tile DSL can elide tail handling. +// assume_bounded_below<0> -- promises num_items >= 0; enables sign-comparison simplifications. template __tile_global__ void transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins) { - namespace ct = ::cuda::tiles; - + namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; Fn fn{}; - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - const auto out_align = ct::assume_aligned<16>(out); - - // Explicit int64_t element type on the extent; CTAD would deduce uint32_t and wrap at 2^32. Using - // int64_t lets us drop the 2^31 runtime cap. 
- auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; - auto out_view = ct::partition_view{out_span, ct::shape{}}; - - auto load_one = [bx, n](auto* ptr) { - auto ptr_align = ct::assume_aligned<16>(ptr); - auto span = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; - auto view = ct::partition_view{span, ct::shape{}}; - return view.load_masked(bx); - }; + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + auto out_view = make_partition_view(out, n); + auto load_one = [bx, n](auto* ptr) { return make_partition_view(ptr, n).load_masked(bx); }; out_view.store_masked(fn(load_one(ins)...), bx); } @@ -62,12 +69,8 @@ __tile_global__ void fill_kernel(::cuda::std::int64_t num_items, T* __restrict__ namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - const auto out_align = ct::assume_aligned<16>(out); - - // Explicit int64_t element type on the extent (see transform_kernel above). 
- auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; - auto out_view = ct::partition_view{out_span, ct::shape{}}; + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + auto out_view = make_partition_view(out, n); using tile_t = ct::tile>; out_view.store_masked(ct::full(value), bx); } From 514b53617db1a2236152f7f06948f11ad9b1d158 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 14:09:27 -0700 Subject: [PATCH 25/83] const-qualify scalar parameters in transform_kernel and fill_kernel --- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 781bd84b6b5..84298e35e78 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -50,7 +50,7 @@ __tile__ auto make_partition_view(T* ptr, N n) // assume_bounded_below<0> -- promises num_items >= 0; enables sign-comparison simplifications. template __tile_global__ void -transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins) +transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) { namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; @@ -64,7 +64,7 @@ transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const In } template -__tile_global__ void fill_kernel(::cuda::std::int64_t num_items, T* __restrict__ out, T value) +__tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) { namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; From 607f7a07fdbddeca626b12d4cd0c51509c4353cc Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 14:18:43 -0700 Subject: [PATCH 26/83] rename runtime_preconditions_ok to runtime_preconditions_valid --- cub/cub/device/device_transform.cuh | 2 +- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 8 ++++---- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index bcdedf8ba95..9e1404c0f62 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -118,7 +118,7 @@ struct DeviceTransform RandomAccessIteratorOut, RandomAccessIteratorsIn...>) { - if (detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast(num_items))) + if (detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(num_items))) { return detail::transform::tile::dispatch( inputs, output, static_cast(num_items), stream); diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index f9ec1e1ff31..0c2dfaf9ba2 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -3,7 +3,7 @@ // Internal dispatch helpers for cub::DeviceTransform's tile path: // tile_dispatch_eligible_v -- compile-time predicate the hook consults -// runtime_preconditions_ok -- runtime alignment + divisibility predicate +// 
runtime_preconditions_valid -- runtime alignment + divisibility predicate // dispatch -- bridge that launches the tile kernel with // the trait's substitute functor // DeviceTransform -- internal tile-local Transform/Fill wrappers @@ -128,7 +128,7 @@ inline constexpr bool tile_dispatch_eligible_v = // Returns false to tell the hook to fall back to the standard CUB dispatch. template CUB_RUNTIME_FUNCTION bool -runtime_preconditions_ok(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) +runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) { // Pointer alignment is in bytes (for LDG.E.128); the kernel's // ct::assume_divisible applies to num_items as an element count. These @@ -151,10 +151,10 @@ runtime_preconditions_ok(::cuda::std::tuple const& inputs, OutIter o // Bridge between cub::DeviceTransform::__transform_internal and the tile // DeviceTransform above. Precondition: tile_dispatch_eligible_v is true AND runtime_preconditions_ok returned true. The kernel +// InIters...> is true AND runtime_preconditions_valid returned true. The kernel // itself assumes 16-byte pointer alignment and num_items divisibility; the // caller (the hook in device_transform.cuh) is responsible for checking -// runtime_preconditions_ok first. +// runtime_preconditions_valid first. // // The tile kernel is launched with the trait's tile_op_type (a tile-friendly // mirror of Op with __tile__ operator), NOT the user's Op instance -- the diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 84298e35e78..f31cfca5d40 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -32,7 +32,7 @@ namespace detail::transform::tile // ct::extents -- explicit element type on the extent; CTAD would deduce uint32_t and // wrap at 2^32. int64_t lets us cover the full num_items range. 
// The caller is responsible for honoring assume_aligned<16>; the dispatch header's -// runtime_preconditions_ok enforces this before launching either kernel. +// runtime_preconditions_valid enforces this before launching either kernel. template __tile__ auto make_partition_view(T* ptr, N n) { From a5d3eca54eadd82f0949ed63582086115a94d448 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 14:30:58 -0700 Subject: [PATCH 27/83] trim tile traits header includes --- cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index 32422766ba4..660ec4f10c6 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -43,12 +43,10 @@ # include -# include +# include # include # include -# include - CUB_NAMESPACE_BEGIN // Public extension surface. 
From 8eaa4950e88557bd0fb3e943bd1339eb26402dbc Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 14:37:13 -0700 Subject: [PATCH 28/83] annotate tile-path return-valued helpers with [[nodiscard]] --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 10 +++++----- .../device/dispatch/kernels/kernel_transform_tile.cuh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 0c2dfaf9ba2..5e4bedf725b 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -55,7 +55,7 @@ namespace detail::transform::tile { template -cudaError_t launch_impl( +[[nodiscard]] cudaError_t launch_impl( ::cuda::std::tuple inputs, Out* output, int64_t num_items, @@ -78,7 +78,7 @@ cudaError_t launch_impl( struct DeviceTransform { template - static cudaError_t + [[nodiscard]] static cudaError_t Transform(::cuda::std::tuple inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0) { constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size(MufuHeavy); @@ -87,7 +87,7 @@ struct DeviceTransform // Fill template - static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) + [[nodiscard]] static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) { if (num_items <= 0) { @@ -127,7 +127,7 @@ inline constexpr bool tile_dispatch_eligible_v = // and ct::assume_divisible<16>, so violating these at runtime is UB. // Returns false to tell the hook to fall back to the standard CUB dispatch. 
template -CUB_RUNTIME_FUNCTION bool +[[nodiscard]] CUB_RUNTIME_FUNCTION bool runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) { // Pointer alignment is in bytes (for LDG.E.128); the kernel's @@ -160,7 +160,7 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte // mirror of Op with __tile__ operator), NOT the user's Op instance -- the // user's scalar functor cannot be invoked on ct::tile arguments. template -CUB_RUNTIME_FUNCTION cudaError_t dispatch( +[[nodiscard]] CUB_RUNTIME_FUNCTION cudaError_t dispatch( ::cuda::std::tuple inputs, OutIter output, OffsetT num_items, cudaStream_t stream) { auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index f31cfca5d40..1d96a29e3c0 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -34,7 +34,7 @@ namespace detail::transform::tile // The caller is responsible for honoring assume_aligned<16>; the dispatch header's // runtime_preconditions_valid enforces this before launching either kernel. 
template -__tile__ auto make_partition_view(T* ptr, N n) +[[nodiscard]] __tile__ auto make_partition_view(T* ptr, N n) { namespace ct = ::cuda::tiles; const auto ptr_align = ct::assume_aligned<16>(ptr); From 0055590b01134fe408ace0922b307fa919ec36d4 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:06:54 -0700 Subject: [PATCH 29/83] drop redundant __detail sub-namespace from tile dispatch helper --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 5e4bedf725b..8b69b85ae22 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -100,13 +100,10 @@ struct DeviceTransform } }; -namespace __detail -{ template using __unwrapped_value_t = ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t()))>>; -} // namespace __detail // Combined compile-time predicate used by cub::DeviceTransform's __transform_internal // to decide whether to route a given (Op, OutIter, InIters...) to the tile path. @@ -119,7 +116,7 @@ inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) && CUB_NS_QUALIFIER::transform::tile_eligible_v< - Op, __detail::__unwrapped_value_t, sizeof...(InIters)>; + Op, __unwrapped_value_t, sizeof...(InIters)>; // Runtime predicate consulted by the cub::DeviceTransform tile hook before // it commits to the tile path. 
Mirrors how CUB's dispatch_t::CanVectorize From c7ee05c41d63341454a3767b87a6e7847bb0c139 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:12:26 -0700 Subject: [PATCH 30/83] use cub::detail::it_value_t and drop hand-rolled unwrap helper --- .../device/dispatch/dispatch_transform_tile.cuh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 8b69b85ae22..49e61de1eba 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -39,9 +39,6 @@ # include # include # include -# include -# include -# include # include # include @@ -100,11 +97,6 @@ struct DeviceTransform } }; -template -using __unwrapped_value_t = - ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t()))>>; - // Combined compile-time predicate used by cub::DeviceTransform's __transform_internal // to decide whether to route a given (Op, OutIter, InIters...) to the tile path. // The call site lifts this into an `if constexpr`: when this is true the hook @@ -115,8 +107,7 @@ template inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) - && CUB_NS_QUALIFIER::transform::tile_eligible_v< - Op, __unwrapped_value_t, sizeof...(InIters)>; + && CUB_NS_QUALIFIER::transform::tile_eligible_v, sizeof...(InIters)>; // Runtime predicate consulted by the cub::DeviceTransform tile hook before // it commits to the tile path. 
Mirrors how CUB's dispatch_t::CanVectorize @@ -166,9 +157,8 @@ template >; - using tile_op_t = - typename CUB_NS_QUALIFIER::transform::tile_eligible::tile_op_type; + using tile_op_t = + typename CUB_NS_QUALIFIER::transform::tile_eligible, sizeof...(InIters)>::tile_op_type; static_assert(::cuda::std::is_empty_v, "tile_op_type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, From 91d3945d0ac13b84b6f861d95f07f89396971361 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:31:31 -0700 Subject: [PATCH 31/83] drop redundant template keyword on DeviceTransform::Transform call --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 49e61de1eba..428f34e8c9b 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -164,7 +164,7 @@ template , "tile_op_type must be trivially default constructible"); - return DeviceTransform::template Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v, tile_op_t>( + return DeviceTransform::Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v, tile_op_t>( in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); } From 3c67f7deb9b4d3f8d98d86bfa1efa7a1e9cdc9a9 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:35:28 -0700 Subject: [PATCH 32/83] wrap kernel-launch error checks with CubDebug --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 428f34e8c9b..7a0815d575c 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ 
b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -31,6 +31,7 @@ # include # include # include +# include # include # include @@ -69,7 +70,7 @@ template <<(num_blocks), 1, 0, stream>>>( num_items, output, ::cuda::std::get(inputs)...); - return cudaGetLastError(); + return CubDebug(cudaGetLastError()); } struct DeviceTransform @@ -93,7 +94,7 @@ struct DeviceTransform constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size(); const int64_t num_blocks = (num_items + chosen - 1) / chosen; fill_kernel<<(num_blocks), 1, 0, stream>>>(num_items, output, value); - return cudaGetLastError(); + return CubDebug(cudaGetLastError()); } }; From fd9b7a285d8160db433ffa4f1c338019b5d853ad Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:39:03 -0700 Subject: [PATCH 33/83] fully qualify tile kernel-launch names and use unsigned in casts --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 7a0815d575c..cc0aa93cea6 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -67,8 +67,8 @@ template <<(num_blocks), 1, 0, stream>>>( - num_items, output, ::cuda::std::get(inputs)...); + CUB_NS_QUALIFIER::detail::transform::tile::transform_kernel + <<(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get(inputs)...); return CubDebug(cudaGetLastError()); } @@ -93,7 +93,8 @@ struct DeviceTransform } constexpr int chosen = (TileSize > 0) ? 
TileSize : pick_tile_size(); const int64_t num_blocks = (num_items + chosen - 1) / chosen; - fill_kernel<<(num_blocks), 1, 0, stream>>>(num_items, output, value); + CUB_NS_QUALIFIER::detail::transform::tile::fill_kernel + <<(num_blocks), 1, 0, stream>>>(num_items, output, value); return CubDebug(cudaGetLastError()); } }; From 0793ffe82afb833f912d40f49f8301b3cb74f984 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:47:36 -0700 Subject: [PATCH 34/83] document tile_mufu_heavy with a usage hint --- cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index 660ec4f10c6..bf8f9caa1a3 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -60,6 +60,8 @@ struct tile_eligible : ::cuda::std::false_type template inline constexpr bool tile_eligible_v = tile_eligible::value; +// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq). Setting this makes +// the tile policy picker cap items/thread so MUFU pipes are not oversaturated. 
template struct tile_mufu_heavy : ::cuda::std::false_type {}; From 3e206e69b9179a999f069220b15c764c2607d705 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:51:37 -0700 Subject: [PATCH 35/83] use ::cuda::ceil_div for block-count math in tile dispatch --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index cc0aa93cea6..f4597cf67ff 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -36,6 +36,7 @@ # include # include +# include # include # include # include @@ -65,7 +66,7 @@ template <<(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get(inputs)...); @@ -92,7 +93,7 @@ struct DeviceTransform return cudaSuccess; } constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size(); - const int64_t num_blocks = (num_items + chosen - 1) / chosen; + const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{chosen}); CUB_NS_QUALIFIER::detail::transform::tile::fill_kernel <<(num_blocks), 1, 0, stream>>>(num_items, output, value); return CubDebug(cudaGetLastError()); From e56f0d642ff81ce1d11f2500b825b90b9b9bf196 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 17:57:24 -0700 Subject: [PATCH 36/83] reuse CUB's cc_to_min_bytes_in_flight, take compute_capability object --- .../dispatch/tuning/tuning_transform_tile.cuh | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index 4bd82475c5c..aabc9c1852a 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// Policy picker for cub::DeviceTransform's tile path. Mirrors the -// bytes-in-flight target used by CUB's non-tile algorithms (see +// Policy picker for cub::DeviceTransform's tile path. Shares the +// bytes-in-flight target used by CUB's non-tile algorithms (calls // tuning_transform.cuh's cc_to_min_bytes_in_flight) but expresses the // answer as a TileSize, since tile kernels partition by compile-time // shape rather than threads*items. @@ -23,6 +23,9 @@ #if _CCCL_CUB_HAS_TILE_TRANSFORM() +# include + +# include # include CUB_NAMESPACE_BEGIN @@ -30,23 +33,6 @@ CUB_NAMESPACE_BEGIN namespace detail::transform::tile { -constexpr int min_bytes_in_flight_per_sm(int cc_x10) -{ - if (cc_x10 >= 1000) - { - return 64 * 1024; // B200 - } - if (cc_x10 >= 900) - { - return 48 * 1024; // H100/H200 - } - if (cc_x10 >= 800) - { - return 16 * 1024; // A100 - } - return 12 * 1024; -} - constexpr int min_size(int a) { return a; @@ -63,7 +49,7 @@ constexpr int min_size(int a, int b, Ts... rest) // registers and the compiler unpacks them and packs them back. reducing the // compute work per thread helps here. need profiling to know the exact cause. template -constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) +constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability cc = {10, 0}) { constexpr int threads_per_block = 128; constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes @@ -75,7 +61,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) // Fill (zero inputs) keeps the same latency target by counting output bytes. constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... 
+ 0) : int(sizeof(Out)); - const int target = min_bytes_in_flight_per_sm(cc_x10); + const int target = cc_to_min_bytes_in_flight(cc); const int items_for_latency = static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); From e9e9939a4e46a8c895092d67433287e3c13da379 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 18:01:25 -0700 Subject: [PATCH 37/83] use ::cuda::std::min initializer list instead of hand-rolled variadic min --- .../dispatch/tuning/tuning_transform_tile.cuh | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index aabc9c1852a..f31066dd034 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -27,23 +27,13 @@ # include # include +# include CUB_NAMESPACE_BEGIN namespace detail::transform::tile { -constexpr int min_size(int a) -{ - return a; -} -template -constexpr int min_size(int a, int b, Ts... rest) -{ - int m = a < b ? a : b; - return min_size(m, rest...); -} - // mufu_heavy=true tells the policy the functor body has heavy MUFU usage. // for small data types, vectorized load will make them arrive packed in // registers and the compiler unpacks them and packs them back. reducing the @@ -56,7 +46,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability constexpr int max_items_per_thread = 32; constexpr int max_occupancy = 16; - constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...); + constexpr int min_elem = ::cuda::std::min({int(sizeof(Out)), int(sizeof(Ins))...}); constexpr int items_for_vec = static_cast(::cuda::ceil_div(vector_bytes, min_elem)); // Fill (zero inputs) keeps the same latency target by counting output bytes. 
From cc77ef2157cafbf7dbf6e5bc296fe7fa760c6a7e Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 9 Jun 2026 18:14:53 -0700 Subject: [PATCH 38/83] drop int() casts on sizeof and use ::cuda::std::max --- .../dispatch/tuning/tuning_transform_tile.cuh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index f31066dd034..51892ef7005 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -27,7 +27,9 @@ # include # include +# include # include +# include CUB_NAMESPACE_BEGIN @@ -46,17 +48,17 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability constexpr int max_items_per_thread = 32; constexpr int max_occupancy = 16; - constexpr int min_elem = ::cuda::std::min({int(sizeof(Out)), int(sizeof(Ins))...}); + constexpr auto min_elem = ::cuda::std::min({sizeof(Out), sizeof(Ins)...}); constexpr int items_for_vec = static_cast(::cuda::ceil_div(vector_bytes, min_elem)); // Fill (zero inputs) keeps the same latency target by counting output bytes. - constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out)); - const int target = cc_to_min_bytes_in_flight(cc); + constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out); + const int target = cc_to_min_bytes_in_flight(cc); const int items_for_latency = static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); - int items = items_for_vec > items_for_latency ? 
items_for_vec : items_for_latency; - items = static_cast(::cuda::next_power_of_two(static_cast(items))); + int items = ::cuda::std::max(items_for_vec, items_for_latency); + items = static_cast(::cuda::next_power_of_two(static_cast(items))); if (items > max_items_per_thread) { items = max_items_per_thread; @@ -64,10 +66,10 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability if (mufu_heavy && min_elem < 4) { - const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 - if (items > byte_cap) + const auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 + if (static_cast(items) > byte_cap) { - items = byte_cap; + items = static_cast(byte_cap); } } From 69d2339676b8422e256cbc666b2745e7fe393570 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 14:25:45 -0700 Subject: [PATCH 39/83] simplify _CCCL_CUB_HAS_TILE_TRANSFORM to just _CCCL_TILE_COMPILATION --- .../dispatch/dispatch_transform_tile_config.cuh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh index cd43a9d8b48..4636d3c5759 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ -5,10 +5,12 @@ // share. Two macros: // // _CCCL_CUB_HAS_TILE_TRANSFORM() -// True when CUB's tile transform machinery is available: CTK 13.3 or newer, -// C++20 (tile DSL requires it), and the tile compilation trajectory -// (--enable-tile). When false, the tile headers (kernel / tuning / dispatch -// / traits) are skipped entirely. +// True when nvcc is compiling in tile mode (--enable-tile, i.e. +// _CCCL_TILE_COMPILATION()). 
The other preconditions tile needs are +// enforced where they belong: CTK 13.3+ is implied because --enable-tile +// is a 13.3+ nvcc flag, and C++20 is enforced by cuda_tile.h itself with +// an explicit #error. When false, the tile headers (kernel / tuning / +// dispatch / traits) are skipped entirely. // // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() // True when the dispatch hook in cub::DeviceTransform should fire. Same as @@ -27,8 +29,7 @@ # pragma system_header #endif // no system header -#define _CCCL_CUB_HAS_TILE_TRANSFORM() \ - (_CCCL_CTK_AT_LEAST(13, 3) && _CCCL_STD_VER >= 2020 && _CCCL_TILE_COMPILATION()) +#define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION() #define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() \ (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)) From 4c2d0c52f114b2da4dbdb9f6d450b74601c82b38 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 15:55:09 -0700 Subject: [PATCH 40/83] fully qualify cub::detail/cub::transform refs + ::cuda* runtime types Per @fbusato review: spell out the full namespace at every site instead of relying on local cub::detail::transform::tile scope. Same for ::cudaError_t, ::cudaStream_t, ::cudaSuccess, ::cudaGetLastError(). Default stream parameters now nullptr instead of literal 0. swapped for ; bare int64_t -> ::cuda::std::int64_t. --- cub/cub/device/device_transform.cuh | 7 +-- .../dispatch/dispatch_transform_tile.cuh | 49 ++++++++++--------- .../dispatch/tuning/tuning_transform_tile.cuh | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 9e1404c0f62..7d8bd316e81 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -113,14 +113,15 @@ struct DeviceTransform // not an error. 
if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v - && detail::transform::tile::tile_dispatch_eligible_v< + && cub::detail::transform::tile::tile_dispatch_eligible_v< TransformOp, RandomAccessIteratorOut, RandomAccessIteratorsIn...>) { - if (detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(num_items))) + if (cub::detail::transform::tile::runtime_preconditions_valid( + inputs, output, static_cast(num_items))) { - return detail::transform::tile::dispatch( + return cub::detail::transform::tile::dispatch( inputs, output, static_cast(num_items), stream); } } diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index f4597cf67ff..63b565f812f 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -46,7 +46,7 @@ # include -# include +# include CUB_NAMESPACE_BEGIN @@ -54,49 +54,54 @@ namespace detail::transform::tile { template -[[nodiscard]] cudaError_t launch_impl( +[[nodiscard]] ::cudaError_t launch_impl( ::cuda::std::tuple inputs, Out* output, - int64_t num_items, - cudaStream_t stream, + ::cuda::std::int64_t num_items, + ::cudaStream_t stream, ::cuda::std::index_sequence) { if (num_items <= 0) { - return cudaSuccess; + return ::cudaSuccess; } - const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{TileSize}); + const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{TileSize}); - CUB_NS_QUALIFIER::detail::transform::tile::transform_kernel + cub::detail::transform::tile::transform_kernel <<(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get(inputs)...); - return CubDebug(cudaGetLastError()); + return CubDebug(::cudaGetLastError()); } struct DeviceTransform { template - [[nodiscard]] static cudaError_t - Transform(::cuda::std::tuple inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 
0) + [[nodiscard]] static ::cudaError_t Transform( + ::cuda::std::tuple inputs, + Out* output, + ::cuda::std::int64_t num_items, + Fn, + ::cudaStream_t stream = nullptr) { - constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size(MufuHeavy); + constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(MufuHeavy); return launch_impl(inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); } // Fill template - [[nodiscard]] static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) + [[nodiscard]] static ::cudaError_t + Fill(T* output, ::cuda::std::int64_t num_items, T value, ::cudaStream_t stream = nullptr) { if (num_items <= 0) { - return cudaSuccess; + return ::cudaSuccess; } - constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size(); - const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{chosen}); - CUB_NS_QUALIFIER::detail::transform::tile::fill_kernel + constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(); + const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen}); + cub::detail::transform::tile::fill_kernel <<(num_blocks), 1, 0, stream>>>(num_items, output, value); - return CubDebug(cudaGetLastError()); + return CubDebug(::cudaGetLastError()); } }; @@ -110,7 +115,7 @@ template inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) - && CUB_NS_QUALIFIER::transform::tile_eligible_v, sizeof...(InIters)>; + && cub::transform::tile_eligible_v, sizeof...(InIters)>; // Runtime predicate consulted by the cub::DeviceTransform tile hook before // it commits to the tile path. 
Mirrors how CUB's dispatch_t::CanVectorize @@ -151,8 +156,8 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte // mirror of Op with __tile__ operator), NOT the user's Op instance -- the // user's scalar functor cannot be invoked on ct::tile arguments. template -[[nodiscard]] CUB_RUNTIME_FUNCTION cudaError_t dispatch( - ::cuda::std::tuple inputs, OutIter output, OffsetT num_items, cudaStream_t stream) +[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t dispatch( + ::cuda::std::tuple inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) { auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); auto in_ptrs = ::cuda::std::apply( @@ -161,13 +166,13 @@ template , sizeof...(InIters)>::tile_op_type; + typename cub::transform::tile_eligible, sizeof...(InIters)>::tile_op_type; static_assert(::cuda::std::is_empty_v, "tile_op_type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, "tile_op_type must be trivially default constructible"); - return DeviceTransform::Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v, tile_op_t>( + return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v, tile_op_t>( in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); } diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index 51892ef7005..bea4e390eab 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -53,7 +53,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability // Fill (zero inputs) keeps the same latency target by counting output bytes. constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... 
+ ::cuda::std::size_t{0}) : sizeof(Out); - const int target = cc_to_min_bytes_in_flight(cc); + const int target = cub::detail::transform::cc_to_min_bytes_in_flight(cc); const int items_for_latency = static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); From 9d861ecee1fbf06dc43ef0f81cf8a7932f49eeeb Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 16:08:49 -0700 Subject: [PATCH 41/83] reflow kernel_transform_tile.cuh comments to 120-column limit Per @fbusato nit: the comment blocks were wrapped at ~96 cols, leaving ~20 cols of the 120 budget unused. Reflow to pack each line near 120. No code or semantic changes. --- .../dispatch/kernels/kernel_transform_tile.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 1d96a29e3c0..154eeceb9c3 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -27,12 +27,12 @@ namespace detail::transform::tile { // Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing: -// assume_aligned<16> -- promises the pointer is 16-byte aligned, so the compiler can pick -// LDG.E.128 vectorized loads/stores. -// ct::extents -- explicit element type on the extent; CTAD would deduce uint32_t and -// wrap at 2^32. int64_t lets us cover the full num_items range. -// The caller is responsible for honoring assume_aligned<16>; the dispatch header's -// runtime_preconditions_valid enforces this before launching either kernel. +// assume_aligned<16> -- promises the pointer is 16-byte aligned, so the compiler can pick LDG.E.128 vectorized +// loads/stores. +// ct::extents -- explicit element type on the extent; CTAD would deduce uint32_t and wrap at 2^32. +// int64_t lets us cover the full num_items range. 
+// The caller is responsible for honoring assume_aligned<16>; the dispatch header's runtime_preconditions_valid +// enforces this before launching either kernel. template [[nodiscard]] __tile__ auto make_partition_view(T* ptr, N n) { @@ -42,9 +42,9 @@ template return ct::partition_view{span, ct::shape{}}; } -// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on -// every pointer and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in -// the dispatch header are responsible for honoring those preconditions. +// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on every pointer +// and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in the dispatch header are +// responsible for honoring those preconditions. // // assume_divisible<16> -- promises num_items % 16 == 0, so the tile DSL can elide tail handling. // assume_bounded_below<0> -- promises num_items >= 0; enables sign-comparison simplifications. 
From b83cc7031707cd836a894c91d045f69b70f1f86c Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 16:53:49 -0700 Subject: [PATCH 42/83] anchor make_partition_view with using-decl; inline stateless Fn Per @fbusato suggestion on transform_kernel: - drop the named `Fn fn{}` local and construct the stateless functor inline as `Fn{}(...)` at the call site - add `using cub::detail::transform::tile::make_partition_view;` to anchor the helper name explicitly (called twice), consistent with the full-qualification convention, instead of relying on enclosing-namespace lookup --- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 154eeceb9c3..d2131a5b646 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -53,14 +53,14 @@ __tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) { namespace ct = ::cuda::tiles; + using cub::detail::transform::tile::make_partition_view; const auto bx = ct::bid().x; - Fn fn{}; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); auto out_view = make_partition_view(out, n); auto load_one = [bx, n](auto* ptr) { return make_partition_view(ptr, n).load_masked(bx); }; - out_view.store_masked(fn(load_one(ins)...), bx); + out_view.store_masked(Fn{}(load_one(ins)...), bx); } template From 60db7cea4721d7bf5bc8114d5a59c091945e76ba Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:00:21 -0700 Subject: [PATCH 43/83] anchor remaining intra-namespace helper calls Same convention as the make_partition_view anchor in transform_kernel, applied to the two spots the qualification sweep left bare: - fill_kernel: add the make_partition_view using-decl (matches transform_kernel) - DeviceTransform::Transform: full-path qualify launch_impl, matching the pick_tile_size call directly above it Type-name references (DeviceTransform within its own namespace) left as-is. --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 3 ++- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 63b565f812f..c038faf8435 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -85,7 +85,8 @@ struct DeviceTransform ::cudaStream_t stream = nullptr) { constexpr int chosen = (TileSize > 0) ? 
TileSize : cub::detail::transform::tile::pick_tile_size(MufuHeavy); - return launch_impl(inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); + return cub::detail::transform::tile::launch_impl( + inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); } // Fill diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index d2131a5b646..157be4b0c2b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -67,6 +67,7 @@ template __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) { namespace ct = ::cuda::tiles; + using cub::detail::transform::tile::make_partition_view; const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); From 85394e1b1a6c231842b1d88cec6c5850339815d2 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:06:07 -0700 Subject: [PATCH 44/83] rename make_partition_view -> make_aligned_partition_view Per @fbusato: the helper bakes in ct::assume_aligned<16>, so the name should advertise that it returns an aligned partition view rather than a plain ct::partition_view. Pure rename, 6 sites, all in this file. --- .../dispatch/kernels/kernel_transform_tile.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 157be4b0c2b..63f5d61d5cb 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -34,7 +34,7 @@ namespace detail::transform::tile // The caller is responsible for honoring assume_aligned<16>; the dispatch header's runtime_preconditions_valid // enforces this before launching either kernel. 
template -[[nodiscard]] __tile__ auto make_partition_view(T* ptr, N n) +[[nodiscard]] __tile__ auto make_aligned_partition_view(T* ptr, N n) { namespace ct = ::cuda::tiles; const auto ptr_align = ct::assume_aligned<16>(ptr); @@ -53,12 +53,12 @@ __tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins) { namespace ct = ::cuda::tiles; - using cub::detail::transform::tile::make_partition_view; + using cub::detail::transform::tile::make_aligned_partition_view; const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - auto out_view = make_partition_view(out, n); - auto load_one = [bx, n](auto* ptr) { return make_partition_view(ptr, n).load_masked(bx); }; + auto out_view = make_aligned_partition_view(out, n); + auto load_one = [bx, n](auto* ptr) { return make_aligned_partition_view(ptr, n).load_masked(bx); }; out_view.store_masked(Fn{}(load_one(ins)...), bx); } @@ -67,11 +67,11 @@ template __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) { namespace ct = ::cuda::tiles; - using cub::detail::transform::tile::make_partition_view; + using cub::detail::transform::tile::make_aligned_partition_view; const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - auto out_view = make_partition_view(out, n); + auto out_view = make_aligned_partition_view(out, n); using tile_t = ct::tile>; out_view.store_masked(ct::full(value), bx); } From 63f6bda135f0e5b8a9574aee42485ee130fea47b Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:20:23 -0700 Subject: [PATCH 45/83] mark out_view const in both tile kernels Per @fbusato: out_view is never re-seated, only stored through. 
ct::partition_view is a non-owning view with shallow const (store_masked/load_masked are const member functions, like std::span/mdspan), so a const view still writes through -- verified the store still compiles and the dispatch test stays bit-exact. Kept the make_aligned_partition_view using-decl rather than fully qualifying inline; the name is already anchored. Applied to fill_kernel too for parity. --- .../dispatch/kernels/kernel_transform_tile.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 63f5d61d5cb..585cefc833d 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -56,9 +56,9 @@ transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, co using cub::detail::transform::tile::make_aligned_partition_view; const auto bx = ct::bid().x; - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - auto out_view = make_aligned_partition_view(out, n); - auto load_one = [bx, n](auto* ptr) { return make_aligned_partition_view(ptr, n).load_masked(bx); }; + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const auto out_view = make_aligned_partition_view(out, n); + auto load_one = [bx, n](auto* ptr) { return make_aligned_partition_view(ptr, n).load_masked(bx); }; out_view.store_masked(Fn{}(load_one(ins)...), bx); } @@ -70,9 +70,9 @@ __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __rest using cub::detail::transform::tile::make_aligned_partition_view; const auto bx = ct::bid().x; - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - auto out_view = make_aligned_partition_view(out, n); - using tile_t = ct::tile>; + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const 
auto out_view = make_aligned_partition_view(out, n); + using tile_t = ct::tile>; out_view.store_masked(ct::full(value), bx); } From 0a81752c661efe8e9de7b8c06b32957a4c5554f7 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:28:15 -0700 Subject: [PATCH 46/83] include specific libcu++ headers instead of umbrella Per @fbusato: use the narrowest internal header for each symbol, matching the sibling non-tile dispatch_transform.cuh convention. - dispatch_transform_tile.cuh: -> ; -> - tuning_transform_tile.cuh: -> + Kept (sibling keeps the umbrella) and . --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ++-- cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index c038faf8435..2e05252cce8 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -36,13 +36,13 @@ # include # include -# include +# include # include # include # include # include +# include # include -# include # include diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index bea4e390eab..df6b849951d 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -25,8 +25,9 @@ # include +# include +# include # include -# include # include # include # include From 33a783a31049402efe7a8fccb669fb1ce9d174b0 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:31:26 -0700 Subject: [PATCH 47/83] make byte_cap constexpr in pick_tile_size Per @fbusato: byte_cap = vector_bytes / min_elem is a constant expression (both operands are constexpr), so constexpr expresses its nature better than const. 
--- cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index df6b849951d..b34da587fac 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -67,7 +67,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability if (mufu_heavy && min_elem < 4) { - const auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 + constexpr auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 if (static_cast(items) > byte_cap) { items = static_cast(byte_cap); From 37cf303e0566e4ba7036ddf01156c67c06586766 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:42:04 -0700 Subject: [PATCH 48/83] drop redundant static_cast on items_for_vec Per @fbusato: ceil_div's result initializes a constexpr int directly; the size_t->int copy-init is well-formed (not list-init) and CCCL's warning set (-Wall -Wextra, no -Wconversion) doesn't flag it, so the cast was noise. 
--- cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index b34da587fac..fec23bcdb43 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -50,7 +50,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability constexpr int max_occupancy = 16; constexpr auto min_elem = ::cuda::std::min({sizeof(Out), sizeof(Ins)...}); - constexpr int items_for_vec = static_cast(::cuda::ceil_div(vector_bytes, min_elem)); + constexpr int items_for_vec = ::cuda::ceil_div(vector_bytes, min_elem); // Fill (zero inputs) keeps the same latency target by counting output bytes. constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out); From 0fb0a4ff4d96455c324d363de7f3a05751c12143 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 17:44:14 -0700 Subject: [PATCH 49/83] define gate macro as literal 1/0 to avoid expansion-to-defined UB _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() previously expanded to (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)). Using it in `#if` generated `defined` via macro expansion -- UB per the standard, flagged by -Wexpansion-to-defined (in -Wall/-Wextra) and fatal under CCCL's -Werror. Library headers hid it via #pragma GCC system_header, but non-system consumers (benches, and the test once it moves to cub/tests) would fail to compile. Switch to the literal 1/0 form, the same idiom _CCCL_TILE_COMPILATION uses: the defined() now lives directly in an #if, and consumers see #if 1 / #if 0. 
--- .../device/dispatch/dispatch_transform_tile_config.cuh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh index 4636d3c5759..8c25ea9bd30 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ -31,5 +31,11 @@ #define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION() -#define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() \ - (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)) +// Defined as a literal 1/0 (not (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(...))) so that +// `#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()` in non-system code (benches, tests) does not +// generate `defined` via macro expansion, which is UB and trips -Wexpansion-to-defined under -Werror. +#if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) +# define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1 +#else +# define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 0 +#endif From 768ab0c3e802dd3cf41d2250899d144946dbb7a0 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 18:00:53 -0700 Subject: [PATCH 50/83] make the vector-width cap an int so the cap comparison needs no casts @fbusato noted the cap looked like it should be int. It actually deduced to size_t (min_elem is size_t from sizeof/cuda::std::min, so vector_bytes/min_elem promotes to size_t), which is why the use sites had two casts and an int-vs-size_t compare. Cast once at the definition so it is genuinely int; the comparison and assignment are then int-vs-int -- no sign-compare, no use-site casts. Also renamed byte_cap -> vec_items_cap: the value is an items/thread count (how many elements fit in one 16-byte vector load), not a byte count, so the old name wrongly implied a byte quantity / size_t. 
--- cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index fec23bcdb43..a7715e6f195 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -67,10 +67,13 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability if (mufu_heavy && min_elem < 4) { - constexpr auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16 - if (static_cast(items) > byte_cap) + // Elements that fit in one 16-byte vector load -> items/thread cap for MUFU-heavy sub-4B ops. + // min_elem is size_t, so cast the quotient once here to keep this an int item count (matches + // items below, so the comparison/assignment stay int-vs-int: no sign-compare, no use-site casts). + constexpr int vec_items_cap = static_cast(vector_bytes / min_elem); // 16 for I8, 8 for I16/half/bf16 + if (items > vec_items_cap) { - items = static_cast(byte_cap); + items = vec_items_cap; } } From 418e59280aacb0fec1435341a5682669f50bd462 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 19:07:20 -0700 Subject: [PATCH 51/83] drop CUB_NS_QUALIFIER from tile_eligible substitutes (sweep missed them) The fa9b87caec qualification sweep converted CUB_NS_QUALIFIER -> cub:: in dispatch_transform_tile.cuh but missed the four tile_op_type aliases in the traits file. They are tile_eligible specializations inside cub::transform, so literal cub::detail::transform::tile::tile_plus/tile_multiplies is correct (and resolves even under CUB_WRAPPED_NAMESPACE via enclosing-namespace lookup). CUB_NS_QUALIFIER is only needed to name cub from outside the namespace. 
--- .../device/dispatch/dispatch_transform_tile_traits.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index bf8f9caa1a3..4759d97569e 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -106,12 +106,12 @@ namespace transform template <> struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type { - using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus; + using tile_op_type = cub::detail::transform::tile::tile_plus; }; template <> struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type { - using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies; + using tile_op_type = cub::detail::transform::tile::tile_multiplies; }; # endif // _CCCL_HAS_NVFP16() @@ -119,12 +119,12 @@ struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::s template <> struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type { - using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus; + using tile_op_type = cub::detail::transform::tile::tile_plus; }; template <> struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type { - using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies; + using tile_op_type = cub::detail::transform::tile::tile_multiplies; }; # endif // _CCCL_HAS_NVBF16() } // namespace transform From 5123278fa96b41ca170d3dd6da512f3263ecf69e Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 19:19:33 -0700 Subject: [PATCH 52/83] document why num_blocks fits the unsigned grid x-dim Per @fbusato suggestion: note that the static_cast(num_blocks) can't truncate -- num_blocks > 2^32-1 
would require num_items > TileSize * 2^32 (>= 2^40 elements), more than any device can hold. Added to launch_impl and the parallel spot in Fill. --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 2e05252cce8..b1e6041d71a 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -66,6 +66,8 @@ template 2^32-1 + // would need num_items > TileSize * 2^32 (>= 2^40 elements), more than any device can hold. const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{TileSize}); cub::detail::transform::tile::transform_kernel @@ -99,6 +101,8 @@ struct DeviceTransform return ::cudaSuccess; } constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(); + // One CTA per tile; see launch_impl -- num_blocks can't exceed the unsigned grid x-dim for + // any device-sized num_items. 
const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen}); cub::detail::transform::tile::fill_kernel <<(num_blocks), 1, 0, stream>>>(num_items, output, value); From 1fa728905929ae52dc36d59d161f144214088c5d Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 19:23:45 -0700 Subject: [PATCH 53/83] clang-format dispatch_transform_tile.cuh --- .../dispatch/dispatch_transform_tile.cuh | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index b1e6041d71a..c170dd71013 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -42,17 +42,15 @@ # include # include # include +# include # include # include -# include - CUB_NAMESPACE_BEGIN namespace detail::transform::tile { - template [[nodiscard]] ::cudaError_t launch_impl( ::cuda::std::tuple inputs, @@ -80,13 +78,10 @@ struct DeviceTransform { template [[nodiscard]] static ::cudaError_t Transform( - ::cuda::std::tuple inputs, - Out* output, - ::cuda::std::int64_t num_items, - Fn, - ::cudaStream_t stream = nullptr) + ::cuda::std::tuple inputs, Out* output, ::cuda::std::int64_t num_items, Fn, ::cudaStream_t stream = nullptr) { - constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(MufuHeavy); + constexpr int chosen = + (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(MufuHeavy); return cub::detail::transform::tile::launch_impl( inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); } @@ -100,7 +95,7 @@ struct DeviceTransform { return ::cudaSuccess; } - constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(); + constexpr int chosen = (TileSize > 0) ? 
TileSize : cub::detail::transform::tile::pick_tile_size(); // One CTA per tile; see launch_impl -- num_blocks can't exceed the unsigned grid x-dim for // any device-sized num_items. const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen}); @@ -137,13 +132,13 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte constexpr int byte_align = 16; constexpr int items_divisor = 16; - auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); const bool aligned_out = ::cuda::std::is_sufficiently_aligned(out_ptr); const bool aligned_in = ::cuda::std::apply( [](auto... iters) { - return ((::cuda::std::is_sufficiently_aligned( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters))) - && ...); + return ( + (::cuda::std::is_sufficiently_aligned(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters))) + && ...); }, inputs); @@ -161,8 +156,8 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte // mirror of Op with __tile__ operator), NOT the user's Op instance -- the // user's scalar functor cannot be invoked on ct::tile arguments. 
template -[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t dispatch( - ::cuda::std::tuple inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) +[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t +dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) { auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); auto in_ptrs = ::cuda::std::apply( @@ -170,8 +165,8 @@ template , sizeof...(InIters)>::tile_op_type; + using tile_op_t = typename cub::transform:: + tile_eligible, sizeof...(InIters)>::tile_op_type; static_assert(::cuda::std::is_empty_v, "tile_op_type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, @@ -180,7 +175,6 @@ template , tile_op_t>( in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); } - } // namespace detail::transform::tile CUB_NAMESPACE_END From 34568d4e57326e1de1c65abb9d01ab25a2762448 Mon Sep 17 00:00:00 2001 From: Nan An Date: Wed, 10 Jun 2026 19:37:44 -0700 Subject: [PATCH 54/83] split tile_op_t alias into an intermediate out_value_t Per @fbusato: break the dense one-liner into an intermediate alias for the output value type, so the tile_eligible instantiation reads cleanly on one line. 
--- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index c170dd71013..67756cb20c9 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -165,8 +165,8 @@ dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_item return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...); }, inputs); - using tile_op_t = typename cub::transform:: - tile_eligible, sizeof...(InIters)>::tile_op_type; + using out_value_t = cub::detail::it_value_t; + using tile_op_t = typename cub::transform::tile_eligible::tile_op_type; static_assert(::cuda::std::is_empty_v, "tile_op_type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, From f80bb25d1741a89b17a7ed3e5cc052044f6d6bd5 Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 16:14:58 -0700 Subject: [PATCH 55/83] separate tile eligibility from the tile-operator substitute Per @fbusato: tile_eligible previously did two jobs -- mark a combo eligible AND carry a `tile_op_type` substitute. Split the two axes: - tile_eligible -> eligibility only (bool). - tile_operator -> the __tile__ functor the tile kernel runs for Op, with NO default: a scalar functor cannot run on ct::tile, so every eligible op must specialize it. Omitting it is a clear static_assert, not a cryptic "calling __host__ __device__ from __tile_global__" kernel error. tile_operator_t is the alias. dispatch now uses tile_operator_t. Built-in cuda::std::plus/multiplies substitutes and all bench/test registrations migrated to the two-trait form. 
--- .../bench/transform/tile/babelstream.cu | 12 ++- cub/benchmarks/bench/transform/tile/copy.cu | 3 +- .../bench/transform/tile/grayscale.cu | 3 +- .../bench/transform/tile/pytorch.cu | 42 ++++++---- .../transform/tile/test_device_transform.cu | 10 ++- .../dispatch/dispatch_transform_tile.cuh | 14 ++-- .../dispatch_transform_tile_traits.cuh | 76 +++++++++++++------ 7 files changed, 105 insertions(+), 55 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index 297ef78379a..9ee750ce35b 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -62,10 +62,14 @@ struct tile_nstream_op { CUB_NAMESPACE_BEGIN namespace transform { -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_mul_op; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_add_op; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_triad_op; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_nstream_op; }; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_operator { using type = tile_mul_op; }; +template <> struct tile_operator { using type = tile_add_op; }; +template <> struct tile_operator { using type = tile_triad_op; }; +template <> struct tile_operator { using type = tile_nstream_op; }; } // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index da9665b2f25..6133c69c684 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -33,7 +33,8 @@ struct tile_identity 
{ CUB_NAMESPACE_BEGIN namespace transform { -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity; }; +template struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_operator { using type = tile_identity; }; } // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index 9f364304266..5ad936019fa 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -42,7 +42,8 @@ struct tile_rgb_to_y { CUB_NAMESPACE_BEGIN namespace transform { -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_rgb_to_y; }; +template struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_operator { using type = tile_rgb_to_y; }; } // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 0e1767fdac7..25e90b7f66e 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -92,12 +92,18 @@ CUB_NAMESPACE_BEGIN namespace transform { // Unary -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_relu; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_sigmoid; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_tanh; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_gelu; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_sin; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_exp; }; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; 
+template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_operator { using type = tile_relu; }; +template <> struct tile_operator { using type = tile_sigmoid; }; +template <> struct tile_operator { using type = tile_tanh; }; +template <> struct tile_operator { using type = tile_gelu; }; +template <> struct tile_operator { using type = tile_sin; }; +template <> struct tile_operator { using type = tile_exp; }; // MUFU-heavy unary ops: hint to tile policy picker to cap items/thread at vector width on sub-4-byte types. template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; @@ -107,14 +113,22 @@ template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; // Binary -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_add; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_sub; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_mul; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_div; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_le; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_ge; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_fmin; }; -template struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_binary_fmax; }; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template 
struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_operator { using type = tile_binary_add; }; +template <> struct tile_operator { using type = tile_binary_sub; }; +template <> struct tile_operator { using type = tile_binary_mul; }; +template <> struct tile_operator { using type = tile_binary_div; }; +template <> struct tile_operator { using type = tile_binary_le; }; +template <> struct tile_operator { using type = tile_binary_ge; }; +template <> struct tile_operator { using type = tile_binary_fmin; }; +template <> struct tile_operator { using type = tile_binary_fmax; }; } // namespace transform CUB_NAMESPACE_END #endif diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu index d3a143a3deb..e2c7a5006bb 100644 --- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu +++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu @@ -182,10 +182,12 @@ void test_fill(int64_t n, T value) { CUB_NAMESPACE_BEGIN namespace transform { -template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity_op; }; -template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_identity_op; }; -template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_square_op; }; -template <> struct tile_eligible : ::cuda::std::true_type { using tile_op_type = tile_square_op; }; +template <> struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_eligible : ::cuda::std::true_type {}; +template <> struct tile_operator { using type = tile_identity_op; }; +template <> struct tile_operator { using type = tile_square_op; }; } // namespace 
transform CUB_NAMESPACE_END #endif diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 67756cb20c9..7ce52562fa0 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -152,9 +152,9 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte // caller (the hook in device_transform.cuh) is responsible for checking // runtime_preconditions_valid first. // -// The tile kernel is launched with the trait's tile_op_type (a tile-friendly -// mirror of Op with __tile__ operator), NOT the user's Op instance -- the -// user's scalar functor cannot be invoked on ct::tile arguments. +// The tile kernel is launched with tile_operator_t: for a scalar Op that is its +// registered tile-friendly mirror (a __tile__ functor), and for an already-tile Op it +// is Op itself. A scalar functor cannot be invoked on ct::tile arguments. template [[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) @@ -165,12 +165,12 @@ dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_item return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...); }, inputs); - using out_value_t = cub::detail::it_value_t; - using tile_op_t = typename cub::transform::tile_eligible::tile_op_type; + // The tile functor to run for TransformOp: its registered tile_operator mirror. 
+ using tile_op_t = cub::transform::tile_operator_t; static_assert(::cuda::std::is_empty_v, - "tile_op_type must be stateless (the tile kernel default-constructs it)"); + "tile_operator type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, - "tile_op_type must be trivially default constructible"); + "tile_operator type must be trivially default constructible"); return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v, tile_op_t>( in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index 4759d97569e..f51d280264b 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -3,23 +3,27 @@ // Compile-time policy for cub::DeviceTransform's tile path. // -// PUBLIC EXTENSION POINTS (cub::transform): -// tile_eligible -- specialize this to opt a (functor type, -// element type, input arity) combo into -// the tile dispatch path. +// PUBLIC EXTENSION POINTS (cub::transform) -- two independent axes: +// tile_eligible -- specialize to true_type to opt a (functor +// type, element type, input arity) combo into +// the tile dispatch path. Eligibility only. // tile_eligible_v<...> -- variable-template companion. +// tile_operator -- the __tile__ functor the tile kernel runs +// for Op. No default: every tile-eligible Op +// must specialize it with `using type = `, +// because a scalar functor (e.g. +// cuda::std::plus<__half>) cannot be invoked +// on ct::tile. Omitting it is a clear +// static_assert, not a cryptic kernel error. +// tile_operator_t -- alias for tile_operator::type. // tile_mufu_heavy -- specialize to flag Op as MUFU-heavy; the // tile policy picker uses this hint. // tile_mufu_heavy_v<...> -- variable-template companion. 
// -// Users call cub::DeviceTransform::Transform with whatever scalar functor -// they have (e.g. cuda::std::plus<__half>). That scalar functor is NOT -// directly callable from a tile transform_kernel -- its operator() takes -// scalars, not ct::tile. So eligible specializations declare a `tile_op_type` -// member naming a tile-friendly replacement (a stateless functor with a -// __tile__ templated operator() that performs the same op on ct::tile args). -// The dispatch hook launches the tile kernel with the replacement, not the -// user's original functor instance. +// Eligibility ("may this combo use the tile path?") and substitution ("which +// __tile__ functor do we actually run?") are separate traits, so an eligible op +// always registers both: tile_eligible and tile_operator. // // INTERNAL (cub::detail::transform::tile): // tile_plus, tile_multiplies -- shipped tile-friendly substitutes used by @@ -41,18 +45,17 @@ #if _CCCL_CUB_HAS_TILE_TRANSFORM() -# include - # include # include # include +# include + CUB_NAMESPACE_BEGIN // Public extension surface. namespace transform { - template struct tile_eligible : ::cuda::std::false_type {}; @@ -60,6 +63,21 @@ struct tile_eligible : ::cuda::std::false_type template inline constexpr bool tile_eligible_v = tile_eligible::value; +// The __tile__ functor the tile kernel runs for Op -- the tile-side mirror of the scalar Op. There is +// no default: a scalar functor cannot be invoked on ct::tile, so every tile-eligible Op must specialize +// this with a `type` naming a stateless __tile__ functor. tile_eligible says a combo MAY use the +// tile path; tile_operator says WHAT the tile kernel runs. 
+template +struct tile_operator +{ + static_assert(sizeof(Op) == 0, + "cub::transform::tile_operator must be specialized for every tile-eligible Op: " + "provide `using type = `."); +}; + +template +using tile_operator_t = typename tile_operator::type; + // Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq). Setting this makes // the tile policy picker cap items/thread so MUFU pipes are not oversaturated. template @@ -68,13 +86,11 @@ struct tile_mufu_heavy : ::cuda::std::false_type template inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; - } // namespace transform // Internal substitutes shipped by CCCL. namespace detail::transform::tile { - // Tile-friendly mirrors of common cuda::std ops. Each has a __tile__ // templated operator() so it can be invoked from inside transform_kernel // where the arguments are ct::tile rather than scalar T. @@ -95,36 +111,48 @@ struct tile_multiplies return a * b; } }; - } // namespace detail::transform::tile // Built-in trait specializations live in the public namespace alongside the // trait, but reference the internal substitute functors. namespace transform { +// cuda::std::plus / multiplies are scalar ops, so each is marked eligible and given a tile_operator mirror. 
# if _CCCL_HAS_NVFP16() template <> struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type +{}; +template <> +struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type +{}; +template <> +struct tile_operator<::cuda::std::plus<::__half>> { - using tile_op_type = cub::detail::transform::tile::tile_plus; + using type = cub::detail::transform::tile::tile_plus; }; template <> -struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type +struct tile_operator<::cuda::std::multiplies<::__half>> { - using tile_op_type = cub::detail::transform::tile::tile_multiplies; + using type = cub::detail::transform::tile::tile_multiplies; }; # endif // _CCCL_HAS_NVFP16() # if _CCCL_HAS_NVBF16() template <> struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type +{}; +template <> +struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type +{}; +template <> +struct tile_operator<::cuda::std::plus<::__nv_bfloat16>> { - using tile_op_type = cub::detail::transform::tile::tile_plus; + using type = cub::detail::transform::tile::tile_plus; }; template <> -struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type +struct tile_operator<::cuda::std::multiplies<::__nv_bfloat16>> { - using tile_op_type = cub::detail::transform::tile::tile_multiplies; + using type = cub::detail::transform::tile::tile_multiplies; }; # endif // _CCCL_HAS_NVBF16() } // namespace transform From dcb838dc7efc7e887ead8a6ef12e76593f4379d5 Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 17:03:15 -0700 Subject: [PATCH 56/83] add gated c2h test for the tile transform dispatch path catch2_test_device_transform_tile.cu, gated by _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED(): in a normal build it is a single skipped test; under --enable-tile it registers a unary (square) and 
a binary (add) scalar op with tile_operator substitutes and checks the transform result is bit-exact with a host std::transform reference across sizes that span the tile path (n % 16 == 0) and the CUB fallback. Verified the TU compiles under --enable-tile; the dispatch correctness it checks also matches the standalone harness. Building it with --enable-tile in-tree needs a scoped/conditional CMake flag (global --enable-tile breaks the c++17 c2h support lib) -- left as a follow-up. --- cub/test/catch2_test_device_transform_tile.cu | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 cub/test/catch2_test_device_transform_tile.cu diff --git a/cub/test/catch2_test_device_transform_tile.cu b/cub/test/catch2_test_device_transform_tile.cu new file mode 100644 index 00000000000..3ff05fac915 --- /dev/null +++ b/cub/test/catch2_test_device_transform_tile.cu @@ -0,0 +1,145 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +// The tile dispatch path only exists when nvcc is invoked with --enable-tile and the user opts in via +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH. In any other build this file compiles to a single skipped test. +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include + +# include + +# include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); + +namespace ct = ::cuda::tiles; + +// Each scalar op (passed to Transform, used by the CUB fallback) pairs with a tile-side mirror +// registered through tile_operator. The bodies use tile-tile arithmetic and wrap for unsigned types, +// so the tile result is bit-exact with the host reference. + +// Unary: v * v. 
+struct square_op +{ + template + __host__ __device__ T operator()(T v) const + { + return static_cast(v * v); + } +}; +struct tile_square_op +{ + template + __tile__ auto operator()(T v) const + { + return v * v; + } +}; + +// Binary: a + b. +struct add_op +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return static_cast(a + b); + } +}; +struct tile_add_op +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; + +CUB_NAMESPACE_BEGIN +namespace transform +{ +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_square_op; +}; + +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_add_op; +}; +} // namespace transform +CUB_NAMESPACE_END + +// Unsigned types so arithmetic wraps deterministically and matches the host reference bit-for-bit. +using tile_types = c2h::type_list<::cuda::std::uint32_t, ::cuda::std::uint64_t>; + +// Sizes span the runtime preconditions: multiples of 16 (with aligned c2h buffers) take the tile +// kernel; the others fall back to the standard CUB dispatch. Both must produce identical results. 
+# define TILE_TRANSFORM_SIZES GENERATE(::cuda::std::int64_t{0}, 16, 32, 128, 1024, 4096, 65536, 17, 127, 1000) + +C2H_TEST("DeviceTransform tile dispatch: unary scalar op routed through its tile_operator substitute", + "[device][transform][tile]", + tile_types) +{ + using type = c2h::get<0, TestType>; + const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES; + CAPTURE(c2h::type_name(), num_items); + + c2h::device_vector in(num_items, thrust::no_init); + c2h::gen(C2H_SEED(2), in); + c2h::device_vector result(num_items, thrust::no_init); + + transform_many(::cuda::std::make_tuple(in.begin()), result.begin(), num_items, square_op{}); + + c2h::host_vector in_h = in; + c2h::host_vector reference_h(num_items, thrust::no_init); + std::transform(in_h.begin(), in_h.end(), reference_h.begin(), square_op{}); + REQUIRE(reference_h == result); +} + +C2H_TEST("DeviceTransform tile dispatch: binary scalar op routed through its tile_operator substitute", + "[device][transform][tile]", + tile_types) +{ + using type = c2h::get<0, TestType>; + const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES; + CAPTURE(c2h::type_name(), num_items); + + c2h::device_vector a(num_items, thrust::no_init); + c2h::device_vector b(num_items, thrust::no_init); + c2h::gen(C2H_SEED(2), a); + c2h::gen(C2H_SEED(2), b); + c2h::device_vector result(num_items, thrust::no_init); + + transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, add_op{}); + + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items, thrust::no_init); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), add_op{}); + REQUIRE(reference_h == result); +} + +#else // !_CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +C2H_TEST("DeviceTransform tile dispatch requires --enable-tile", "[device][transform][tile]") +{ + SUCCEED("tile transform dispatch not enabled in this build"); +} + +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() From 
14a30fc9f66d8b3e6d5316c8401bd42638c8a1a2 Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 17:10:42 -0700 Subject: [PATCH 57/83] use thrust::device_vector in copy bench Per @fbusato: replace the manual cudaMalloc/cudaFree pair with thrust::device_vector (RAII, no leak on early return), passing thrust::raw_pointer_cast(...) to Transform. --- cub/benchmarks/bench/transform/tile/copy.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index 6133c69c684..8b7cbb4dedf 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -10,6 +10,8 @@ #include +#include + #include #include #include @@ -41,18 +43,18 @@ CUB_NAMESPACE_END template void copy(nvbench::state& state, nvbench::type_list) { - auto n = state.get_int64("Elements{io}"); - T *in, *out; - cudaMalloc(&in, n * sizeof(T)); cudaMalloc(&out, n * sizeof(T)); - bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize(); + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector in(n), out(n); + T* in_ptr = thrust::raw_pointer_cast(in.data()); + T* out_ptr = thrust::raw_pointer_cast(out.data()); + bench_init::rand_fill(in_ptr, n, 0xA111); cudaDeviceSynchronize(); state.add_element_count(n); state.add_global_memory_reads(n); state.add_global_memory_writes(n); state.exec([&](nvbench::launch& launch) { cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(in), out, n, identity{}, launch.get_stream()); + ::cuda::std::make_tuple(in_ptr), out_ptr, n, identity{}, launch.get_stream()); }); - cudaFree(in); cudaFree(out); } using types = nvbench::type_list; From 67af2573dcf4c1bc2a491055eb1b82fa06e44557 Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 17:24:29 -0700 Subject: [PATCH 58/83] rewrite tile babelstream bench to CUB conventions Mirror the base transform/babelstream.cu: include ../common.h (nvbench_helper + 
bench_transform), thrust::device_vector with constant init, try/catch OOM, and NVBENCH_BENCH_TYPES with set_type_axes_names + nvbench::range. Named ops keep their gated tile_operator registrations so --enable-tile routes them to the tile kernel. Drops the ad-hoc Buffers/cudaMalloc and bench_init.cuh usage. __int128 omitted (unsupported on the tile path). --- .../bench/transform/tile/babelstream.cu | 314 +++++++++++------- 1 file changed, 193 insertions(+), 121 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index 9ee750ce35b..6e9caf03f2d 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -1,162 +1,234 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// BabelStream-style bandwidth benchmarks via cub::DeviceTransform::Transform. -// Custom ops self-register their tile substitutes via tile_eligible<>, so the -// dispatch hook routes them to the tile kernel under --enable-tile + the -// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro. +// Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by +// named, stateless ops that register a tile_operator substitute (gated). Under --enable-tile + +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this +// is the standard CUB transform path. This file disappears once tile dispatch is fully transparent. -#include - -#include - -#include -#include -#include -#include +#include "../common.h" #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif -#include "bench_init.cuh" - -// User-defined scalar ops (used at the call site, in both build modes). 
-struct mul_op { - template - __host__ __device__ auto operator()(B b) const { return -(b + b); } +// Stateless scalar ops, used at the call site in both build modes. Constants are baked in so the ops +// stay stateless (the tile substitute must be trivially default constructible): with startScalar == -2, +// `c * scalar` is `-(c + c)`, `b + scalar * c` is `b - c - c`, etc. +struct mul_op +{ + template + __host__ __device__ auto operator()(B b) const + { + return -(b + b); + } }; -struct add_op { - template - __host__ __device__ auto operator()(A a, B b) const { return a + b; } +struct add_op +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a + b; + } }; -struct triad_op { - template - __host__ __device__ auto operator()(B b, C c) const { return b - c - c; } +struct triad_op +{ + template + __host__ __device__ auto operator()(B b, C c) const + { + return b - c - c; + } }; -struct nstream_op { - template - __host__ __device__ auto operator()(A a, B b, C c) const { return a + b - c - c; } +struct nstream_op +{ + template + __host__ __device__ auto operator()(A a, B b, C c) const + { + return a + b - c - c; + } }; #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() -// Tile-friendly substitutes (must be stateless + trivially default constructible). 
-struct tile_mul_op { - template - __tile__ auto operator()(B b) const { return -(b + b); } +struct tile_mul_op +{ + template + __tile__ auto operator()(B b) const + { + return -(b + b); + } }; -struct tile_add_op { - template - __tile__ auto operator()(A a, B b) const { return a + b; } +struct tile_add_op +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } }; -struct tile_triad_op { - template - __tile__ auto operator()(B b, C c) const { return b - c - c; } +struct tile_triad_op +{ + template + __tile__ auto operator()(B b, C c) const + { + return b - c - c; + } }; -struct tile_nstream_op { - template - __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; } +struct tile_nstream_op +{ + template + __tile__ auto operator()(A a, B b, C c) const + { + return a + b - c - c; + } }; -// Self-register each scalar op for all T (partial specialization on T). CUB_NAMESPACE_BEGIN namespace transform { -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_operator { using type = tile_mul_op; }; -template <> struct tile_operator { using type = tile_add_op; }; -template <> struct tile_operator { using type = tile_triad_op; }; -template <> struct tile_operator { using type = tile_nstream_op; }; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_mul_op; +}; +template <> +struct tile_operator +{ + using type = tile_add_op; +}; +template <> +struct tile_operator +{ + using type = tile_triad_op; +}; +template <> +struct tile_operator +{ + using type = tile_nstream_op; +}; } // 
namespace transform CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +// The tile path does not support __int128 (no tensor_span/partition_view for it), so the type axis +// omits it relative to the base babelstream bench. +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = nvbench::type_list; #endif -// True if `bytes_needed` worth of GPU memory is available, with 5% headroom -// for driver overhead. Caller should `state.skip(...)` on false. -inline bool gpu_mem_available(size_t bytes_needed) { - size_t free_b = 0, total_b = 0; - if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) return false; - return bytes_needed + (bytes_needed / 20) < free_b; -} +inline auto array_size_powers = nvbench::range(16, 32, 4); -template -struct Buffers { - T *a{}, *b{}, *c{}; - int64_t n{}; - Buffers(int64_t n) : n(n) { - cudaMalloc(&a, n * sizeof(T)); - cudaMalloc(&b, n * sizeof(T)); - cudaMalloc(&c, n * sizeof(T)); - bench_init::rand_fill(a, n, 0xA111); - bench_init::rand_fill(b, n, 0xB222); - bench_init::rand_fill(c, n, 0xC333); - cudaDeviceSynchronize(); - } - ~Buffers() { cudaFree(a); cudaFree(b); cudaFree(c); } -}; +// Same constant inputs as the base bench so nstream maintains a consistent workload. 
+inline constexpr auto startA = 11; +inline constexpr auto startB = 2; +inline constexpr auto startC = 1; +inline constexpr auto startScalar = -2; +static_assert(startA == (startA + startB + startScalar * startC), "nstream must have a consistent workload"); -// --- benchmarks --- template -void mul(nvbench::state& state, nvbench::type_list) { - auto n = state.get_int64("Elements{io}"); - Buffers buf(n); - state.add_element_count(n); - state.add_global_memory_reads(n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(buf.b), buf.c, n, mul_op{}, launch.get_stream()); - }); +static void mul(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{c.begin()}, b.begin(), n, mul_op{}); } - -template -void add(nvbench::state& state, nvbench::type_list) { - auto n = state.get_int64("Elements{io}"); - Buffers buf(n); - state.add_element_count(n); - state.add_global_memory_reads(2 * n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(buf.a, buf.b), buf.c, n, add_op{}, launch.get_stream()); - }); +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); } +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_mul") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + template -void triad(nvbench::state& state, nvbench::type_list) { - auto n = state.get_int64("Elements{io}"); - Buffers buf(n); - state.add_element_count(n); - state.add_global_memory_reads(2 * n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { 
- cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(buf.b, buf.c), buf.a, n, triad_op{}, launch.get_stream()); - }); +static void add(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, add_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); } +NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_add") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + template -void nstream(nvbench::state& state, nvbench::type_list) { - auto n = state.get_int64("Elements{io}"); - Buffers buf(n); - state.add_element_count(n); - state.add_global_memory_reads(3 * n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(buf.a, buf.b, buf.c), buf.a, n, nstream_op{}, launch.get_stream()); - }); +static void triad(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, triad_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); } -using types = nvbench::type_list; -inline auto sizes = std::vector{16, 20, 24, 28, 31}; +NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_triad") + .set_type_axes_names({"T{ct}"}) + 
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); -NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(types)).set_name("tile_mul").add_int64_power_of_two_axis("Elements{io}", sizes); -NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(types)).set_name("tile_add").add_int64_power_of_two_axis("Elements{io}", sizes); -NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(types)).set_name("tile_triad").add_int64_power_of_two_axis("Elements{io}", sizes); -NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream").add_int64_power_of_two_axis("Elements{io}", sizes); +template +static void nstream(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), n, nstream_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} -NVBENCH_MAIN +NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_nstream") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); From 2a1983abfd5c40fa00a5ebdcf5741b0cea59c1aa Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 17:28:45 -0700 Subject: [PATCH 59/83] rewrite remaining tile benches to CUB conventions; drop redundant files grayscale/copy/pytorch: same treatment as babelstream -- ../common.h (nvbench_helper + bench_transform), thrust::device_vector + generate(), try/catch OOM, NVBENCH_BENCH_TYPES with set_type_axes_names + nvbench::range, gated tile_operator registrations (pytorch keeps tile_mufu_heavy hints). Deletions: - fill.cu: the tile path has no fill kernel wired into dispatch (zero-input Transform isn't routed), so it only duplicated the base fill bench. 
fill-on-tile remains a follow-up. - test_device_transform.cu: superseded by cub/test/catch2_test_device_transform_tile.cu. - bench_init.cuh: replaced by nvbench_helper/common.h. --- .../bench/transform/tile/bench_init.cuh | 67 -- cub/benchmarks/bench/transform/tile/copy.cu | 95 +-- cub/benchmarks/bench/transform/tile/fill.cu | 34 - .../bench/transform/tile/grayscale.cu | 111 +-- .../bench/transform/tile/pytorch.cu | 640 +++++++++++++----- .../transform/tile/test_device_transform.cu | 217 ------ 6 files changed, 589 insertions(+), 575 deletions(-) delete mode 100644 cub/benchmarks/bench/transform/tile/bench_init.cuh delete mode 100644 cub/benchmarks/bench/transform/tile/fill.cu delete mode 100644 cub/benchmarks/bench/transform/tile/test_device_transform.cu diff --git a/cub/benchmarks/bench/transform/tile/bench_init.cuh b/cub/benchmarks/bench/transform/tile/bench_init.cuh deleted file mode 100644 index da3e37f8c40..00000000000 --- a/cub/benchmarks/bench/transform/tile/bench_init.cuh +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause - -#pragma once - -#include -#include -#include -#include -#include - -namespace bench_init { - -// splitmix64 — fast deterministic PRNG, one mix per element. -__device__ __forceinline__ uint64_t splitmix64(uint64_t x) { - x += 0x9E3779B97F4A7C15ULL; - x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL; - x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL; - return x ^ (x >> 31); -} - -// Map a uint64 to a "reasonable" finite value of T in roughly [-1, 1) for floats, -// or to a non-zero byte for small ints (so neither all-zero nor pathological). 
-template -__device__ __forceinline__ T from_random(uint64_t r) { - if constexpr (std::is_same_v) { - // 24-bit mantissa precision, range (-1, 1) - uint32_t u = uint32_t(r >> 40); // 24 bits - float f = float(u) * (1.0f / float(1u << 23)) - 1.0f; - return f; - } else if constexpr (std::is_same_v) { - uint64_t u = r >> 11; // 53 bits - double d = double(u) * (1.0 / double(1ull << 52)) - 1.0; - return d; - } else if constexpr (std::is_same_v) { - uint32_t u = uint32_t(r >> 40); - float f = float(u) * (1.0f / float(1u << 23)) - 1.0f; - return __float2half(f); - } else if constexpr (std::is_same_v) { - uint32_t u = uint32_t(r >> 40); - float f = float(u) * (1.0f / float(1u << 23)) - 1.0f; - return __float2bfloat16(f); - } else { - // integer types: small non-zero values, biased away from zero so div is meaningful - int v = int(r & 0x7f) + 1; // 1..128 - if (r & 0x100) v = -v; // sometimes negative - return T(v); - } -} - -template -__global__ void rand_fill_kernel(T* __restrict__ p, int64_t n, uint64_t seed) { - int64_t stride = int64_t(gridDim.x) * blockDim.x; - for (int64_t i = int64_t(blockIdx.x) * blockDim.x + threadIdx.x; i < n; i += stride) { - p[i] = from_random(splitmix64(seed ^ uint64_t(i))); - } -} - -template -inline void rand_fill(T* p, int64_t n, uint64_t seed = 0xC0FFEE) { - int block = 256; - int64_t nblocks = (n + block - 1) / block; - int grid = int(nblocks < 65535 ? nblocks : 65535); - rand_fill_kernel<<>>(p, n, seed); -} - -} // namespace bench_init diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index 8b7cbb4dedf..85ed12e0d4d 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -1,67 +1,78 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Pure copy bench (identity transform). 
Custom identity op self-registers -// its tile substitute via tile_eligible<>; under --enable-tile + the -// dispatch macro this routes to the tile load_masked/store_masked path, -// otherwise it falls through to CUB's standard transform. +// Pure copy (identity transform) -- measures plain load/store bandwidth through the tile +// load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under +// --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes it to the tile kernel, +// otherwise it falls through to CUB's standard transform. This file disappears once tile dispatch is +// fully transparent. -#include - -#include - -#include - -#include -#include -#include -#include +#include "../common.h" #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif -#include "bench_init.cuh" - -struct identity { - template __host__ __device__ auto operator()(T v) const { return v; } +struct identity +{ + template + __host__ __device__ auto operator()(T v) const + { + return v; + } }; #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() -struct tile_identity { - template __tile__ auto operator()(T v) const { return v; } +struct tile_identity +{ + template + __tile__ auto operator()(T v) const + { + return v; + } }; CUB_NAMESPACE_BEGIN namespace transform { -template struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_operator { using type = tile_identity; }; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_identity; +}; } // namespace transform CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = nvbench::type_list; #endif template -void copy(nvbench::state& state, nvbench::type_list) { - const auto n = state.get_int64("Elements{io}"); - thrust::device_vector in(n), out(n); - T* in_ptr = 
thrust::raw_pointer_cast(in.data()); - T* out_ptr = thrust::raw_pointer_cast(out.data()); - bench_init::rand_fill(in_ptr, n, 0xA111); cudaDeviceSynchronize(); - state.add_element_count(n); - state.add_global_memory_reads(n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(in_ptr), out_ptr, n, identity{}, launch.get_stream()); - }); -} +static void copy(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); -using types = nvbench::type_list; -inline auto sizes = std::vector{16, 20, 24, 28, 31}; + thrust::device_vector in = generate(n); + thrust::device_vector out(n, thrust::no_init); -NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(types)) - .set_name("tile_copy") - .add_int64_power_of_two_axis("Elements{io}", sizes); + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, identity{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} -NVBENCH_MAIN +NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_copy") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4)); diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu deleted file mode 100644 index 5105b25b67b..00000000000 --- a/cub/benchmarks/bench/transform/tile/fill.cu +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause - -// Fill: zero-input broadcast. Calls cub::DeviceTransform::Fill, which goes -// through the unified __transform_internal path -- our trait dispatch hook -// sees the zero-input case but currently has no trait spec for it, so this -// lands on CUB's standard Fill kernel. 
Wire a tile substitute later if Fill -// becomes a bottleneck. - -#include - -#include - -#include - -template -void fill(nvbench::state& state, nvbench::type_list) { - const auto n = state.get_int64("Elements{io}"); - T* out; cudaMalloc(&out, n * sizeof(T)); - state.add_element_count(n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Fill(out, n, T(42), launch.get_stream()); - }); - cudaFree(out); -} - -// CUB sweeps integral types: int8/16/32/64 -using fill_types = nvbench::type_list; - -NVBENCH_BENCH_TYPES(fill, NVBENCH_TYPE_AXES(fill_types)).set_name("tile_fill") - .add_int64_power_of_two_axis("Elements{io}", std::vector{16, 20, 24, 28, 31}); - -NVBENCH_MAIN diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index 5ad936019fa..f9ab98d62ad 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -1,76 +1,85 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Grayscale: RGB pixel -> luminance via three separate input streams. -// Custom rgb_to_y op self-registers its tile substitute via tile_eligible<>. +// Tile variant of the grayscale transform bench. Unlike the base bench (a single rgb_t struct +// input), this uses three separate R/G/B streams so the inputs are plain element types the tile path +// can vectorize. The named rgb_to_y op registers a tile_operator substitute (gated). This file +// disappears once tile dispatch is fully transparent. 
-#include - -#include - -#include -#include -#include +#include "../common.h" #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif -#include "bench_init.cuh" - -struct rgb_to_y { - template - __host__ __device__ auto operator()(R r, G g, B b) const { - constexpr float w_r = 0.2989f; - constexpr float w_g = 0.587f; - constexpr float w_b = 0.114f; - return w_r * r + w_g * g + w_b * b; - } +struct rgb_to_y +{ + template + __host__ __device__ auto operator()(R r, G g, B b) const + { + constexpr float w_r = 0.2989f; + constexpr float w_g = 0.587f; + constexpr float w_b = 0.114f; + return w_r * r + w_g * g + w_b * b; + } }; #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() -struct tile_rgb_to_y { - template - __tile__ auto operator()(R r, G g, B b) const { - constexpr float w_r = 0.2989f; - constexpr float w_g = 0.587f; - constexpr float w_b = 0.114f; - return w_r * r + w_g * g + w_b * b; - } +struct tile_rgb_to_y +{ + template + __tile__ auto operator()(R r, G g, B b) const + { + constexpr float w_r = 0.2989f; + constexpr float w_g = 0.587f; + constexpr float w_b = 0.114f; + return w_r * r + w_g * g + w_b * b; + } }; CUB_NAMESPACE_BEGIN namespace transform { -template struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_operator { using type = tile_rgb_to_y; }; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_rgb_to_y; +}; } // namespace transform CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +#ifdef TUNE_T +using value_types = nvbench::type_list; +#else +using value_types = nvbench::type_list; #endif template -void grayscale(nvbench::state& state, nvbench::type_list) { - const auto n = state.get_int64("Elements{io}"); - T *r, *g, *b, *out; - cudaMalloc(&r, n*sizeof(T)); cudaMalloc(&g, n*sizeof(T)); cudaMalloc(&b, n*sizeof(T)); - cudaMalloc(&out, n*sizeof(T)); - bench_init::rand_fill(r, n, 0xA111); - bench_init::rand_fill(g, n, 0xA222); - 
bench_init::rand_fill(b, n, 0xA333); - - state.add_element_count(n); - state.add_global_memory_reads(3 * n); // matches CUB's rgb_t = 3*sizeof(T) - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(r, g, b), out, n, rgb_to_y{}, launch.get_stream()); - }); - cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out); -} +static void grayscale(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); -using value_types = nvbench::type_list; + thrust::device_vector r = generate(n); + thrust::device_vector g = generate(n); + thrust::device_vector b = generate(n); + thrust::device_vector out(n, thrust::no_init); -NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types)).set_name("tile_grayscale") - .add_int64_power_of_two_axis("Elements{io}", std::vector{16, 20, 24, 28, 31}); + state.add_element_count(n); + state.add_global_memory_reads(3 * n); // matches the base bench's rgb_t = 3 * sizeof(T) + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{r.begin(), g.begin(), b.begin()}, out.begin(), n, rgb_to_y{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} -NVBENCH_MAIN +NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types)) + .set_name("tile_grayscale") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4)); diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 25e90b7f66e..006f88ff5c6 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -1,200 +1,512 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// PyTorch-style ops via cub::DeviceTransform::Transform. 
Each custom op -// self-registers a tile substitute through tile_eligible<>, so the dispatch -// hook routes them to the tile kernel under --enable-tile + the -// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro. MUFU-heavy ops also opt into -// tile_mufu_heavy<> so the tile policy picker caps items/thread at the -// vector width on sub-4-byte types. +// Tile variant of the PyTorch-style transform benches. Each named op registers a tile_operator +// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy<> so the tile policy picker caps +// items/thread at the vector width on sub-4-byte types. Under --enable-tile + +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this +// is the standard CUB path. This file disappears once tile dispatch is fully transparent. -#include - -#include - -#include -#include #include +#include + #include -#include -#include + +#include "../common.h" #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() # include #endif -#include "bench_init.cuh" - -// ======================================================================== -// Scalar ops (the types the user passes to cub::DeviceTransform::Transform). -// Sub-4-byte input types compute in float and cast back, matching the tile -// substitute below. -// ======================================================================== -template __host__ __device__ float to_f(T v) { return static_cast(v); } -template __host__ __device__ T from_f(float f) { return static_cast(f); } - -struct relu_op { template __host__ __device__ T operator()(T v) const { - float f = to_f(v); return from_f(f > 0.0f ? 
f : 0.0f); } }; -struct sigmoid_op { template __host__ __device__ T operator()(T v) const { - float f = to_f(v); return from_f(1.0f / (1.0f + ::cuda::std::exp(-f))); } }; -struct tanh_op { template __host__ __device__ T operator()(T v) const { - return from_f(::cuda::std::tanh(to_f(v))); } }; -struct gelu_op { template __host__ __device__ T operator()(T v) const { +// Scalar ops the user passes to Transform. Sub-4-byte input types compute in float and cast back, +// matching the tile substitutes below. +template +__host__ __device__ float to_f(T v) +{ + return static_cast(v); +} +template +__host__ __device__ T from_f(float f) +{ + return static_cast(f); +} + +struct relu_op +{ + template + __host__ __device__ T operator()(T v) const + { + float f = to_f(v); + return from_f(f > 0.0f ? f : 0.0f); + } +}; +struct sigmoid_op +{ + template + __host__ __device__ T operator()(T v) const + { + float f = to_f(v); + return from_f(1.0f / (1.0f + ::cuda::std::exp(-f))); + } +}; +struct tanh_op +{ + template + __host__ __device__ T operator()(T v) const + { + return from_f(::cuda::std::tanh(to_f(v))); + } +}; +struct gelu_op +{ + template + __host__ __device__ T operator()(T v) const + { constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; float f = to_f(v); - return from_f(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f)))); } }; -struct sin_op { template __host__ __device__ T operator()(T v) const { - return from_f(::cuda::std::sin(to_f(v))); } }; -struct exp_op { template __host__ __device__ T operator()(T v) const { - return from_f(::cuda::std::exp(to_f(v))); } }; - -struct binary_add { template __host__ __device__ auto operator()(A a, B b) const { return a + b; } }; -struct binary_sub { template __host__ __device__ auto operator()(A a, B b) const { return a - b; } }; -struct binary_mul { template __host__ __device__ auto operator()(A a, B b) const { return a * b; } }; -struct binary_div { template __host__ __device__ auto operator()(A a, B b) const { 
return a / b; } }; -struct binary_le { template __host__ __device__ A operator()(A a, B b) const { return static_cast(a <= b); } }; -struct binary_ge { template __host__ __device__ A operator()(A a, B b) const { return static_cast(a >= b); } }; -struct binary_fmin { template __host__ __device__ auto operator()(A a, B b) const { return a < b ? a : b; } }; -struct binary_fmax { template __host__ __device__ auto operator()(A a, B b) const { return a > b ? a : b; } }; - -// ======================================================================== -// Tile substitutes + trait registration. Only compiled under tile mode. -// ======================================================================== + return from_f(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f)))); + } +}; +struct sin_op +{ + template + __host__ __device__ T operator()(T v) const + { + return from_f(::cuda::std::sin(to_f(v))); + } +}; +struct exp_op +{ + template + __host__ __device__ T operator()(T v) const + { + return from_f(::cuda::std::exp(to_f(v))); + } +}; + +struct binary_add +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a + b; + } +}; +struct binary_sub +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a - b; + } +}; +struct binary_mul +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a * b; + } +}; +struct binary_div +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a / b; + } +}; +struct binary_le +{ + template + __host__ __device__ A operator()(A a, B b) const + { + return static_cast(a <= b); + } +}; +struct binary_ge +{ + template + __host__ __device__ A operator()(A a, B b) const + { + return static_cast(a >= b); + } +}; +struct binary_fmin +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a < b ? a : b; + } +}; +struct binary_fmax +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a > b ? 
a : b; + } +}; + #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() namespace ct = ::cuda::tiles; -template __tile__ auto as_float(T v) { return ct::element_cast(v); } -template __tile__ auto from_float(F f) { return ct::element_cast>(f); } +template +__tile__ auto as_float(T v) +{ + return ct::element_cast(v); +} +template +__tile__ auto from_float(F f) +{ + return ct::element_cast>(f); +} -struct tile_relu { template __tile__ auto operator()(T v) const { - auto f = as_float(v); return from_float(ct::select(f > 0.0f, f, f - f)); } }; -struct tile_sigmoid { template __tile__ auto operator()(T v) const { - auto f = as_float(v); return from_float(1.0f / (1.0f + ct::exp(-f))); } }; -struct tile_tanh { template __tile__ auto operator()(T v) const { - return from_float(ct::tanh(as_float(v))); } }; -struct tile_gelu { template __tile__ auto operator()(T v) const { +struct tile_relu +{ + template + __tile__ auto operator()(T v) const + { + auto f = as_float(v); + return from_float(ct::select(f > 0.0f, f, f - f)); + } +}; +struct tile_sigmoid +{ + template + __tile__ auto operator()(T v) const + { + auto f = as_float(v); + return from_float(1.0f / (1.0f + ct::exp(-f))); + } +}; +struct tile_tanh +{ + template + __tile__ auto operator()(T v) const + { + return from_float(ct::tanh(as_float(v))); + } +}; +struct tile_gelu +{ + template + __tile__ auto operator()(T v) const + { constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; auto f = as_float(v); - return from_float(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); } }; -struct tile_sin { template __tile__ auto operator()(T v) const { return from_float(ct::sin(as_float(v))); } }; -struct tile_exp { template __tile__ auto operator()(T v) const { return from_float(ct::exp(as_float(v))); } }; - -struct tile_binary_add { template __tile__ auto operator()(A a, B b) const { return a + b; } }; -struct tile_binary_sub { template __tile__ auto operator()(A a, B b) const { return a - b; } }; -struct tile_binary_mul { 
template __tile__ auto operator()(A a, B b) const { return a * b; } }; -struct tile_binary_div { template __tile__ auto operator()(A a, B b) const { return a / b; } }; -struct tile_binary_le { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a <= b); } }; -struct tile_binary_ge { template __tile__ auto operator()(A a, B b) const { return ct::element_cast>(a >= b); } }; -struct tile_binary_fmin { template __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } }; -struct tile_binary_fmax { template __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } }; + return from_float(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); + } +}; +struct tile_sin +{ + template + __tile__ auto operator()(T v) const + { + return from_float(ct::sin(as_float(v))); + } +}; +struct tile_exp +{ + template + __tile__ auto operator()(T v) const + { + return from_float(ct::exp(as_float(v))); + } +}; + +struct tile_binary_add +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; +struct tile_binary_sub +{ + template + __tile__ auto operator()(A a, B b) const + { + return a - b; + } +}; +struct tile_binary_mul +{ + template + __tile__ auto operator()(A a, B b) const + { + return a * b; + } +}; +struct tile_binary_div +{ + template + __tile__ auto operator()(A a, B b) const + { + return a / b; + } +}; +struct tile_binary_le +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::element_cast>(a <= b); + } +}; +struct tile_binary_ge +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::element_cast>(a >= b); + } +}; +struct tile_binary_fmin +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::select(a < b, a, b); + } +}; +struct tile_binary_fmax +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::select(a > b, a, b); + } +}; CUB_NAMESPACE_BEGIN namespace transform { // Unary -template struct tile_eligible : 
::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_operator { using type = tile_relu; }; -template <> struct tile_operator { using type = tile_sigmoid; }; -template <> struct tile_operator { using type = tile_tanh; }; -template <> struct tile_operator { using type = tile_gelu; }; -template <> struct tile_operator { using type = tile_sin; }; -template <> struct tile_operator { using type = tile_exp; }; - -// MUFU-heavy unary ops: hint to tile policy picker to cap items/thread at vector width on sub-4-byte types. -template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; -template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; -template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; -template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; -template <> struct tile_mufu_heavy : ::cuda::std::true_type {}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_relu; +}; +template <> +struct tile_operator +{ + using type = tile_sigmoid; +}; +template <> +struct tile_operator +{ + using type = tile_tanh; +}; +template <> +struct tile_operator +{ + using type = tile_gelu; +}; +template <> +struct tile_operator +{ + using type = tile_sin; +}; +template <> +struct tile_operator +{ + using type = tile_exp; +}; + +// MUFU-heavy unary ops: hint the tile policy picker to cap items/thread 
at the vector width on +// sub-4-byte types. +template <> +struct tile_mufu_heavy : ::cuda::std::true_type +{}; +template <> +struct tile_mufu_heavy : ::cuda::std::true_type +{}; +template <> +struct tile_mufu_heavy : ::cuda::std::true_type +{}; +template <> +struct tile_mufu_heavy : ::cuda::std::true_type +{}; +template <> +struct tile_mufu_heavy : ::cuda::std::true_type +{}; // Binary -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_operator { using type = tile_binary_add; }; -template <> struct tile_operator { using type = tile_binary_sub; }; -template <> struct tile_operator { using type = tile_binary_mul; }; -template <> struct tile_operator { using type = tile_binary_div; }; -template <> struct tile_operator { using type = tile_binary_le; }; -template <> struct tile_operator { using type = tile_binary_ge; }; -template <> struct tile_operator { using type = tile_binary_fmin; }; -template <> struct tile_operator { using type = tile_binary_fmax; }; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template +struct tile_eligible : ::cuda::std::true_type +{}; +template <> +struct tile_operator +{ + using type = tile_binary_add; +}; 
+template <> +struct tile_operator +{ + using type = tile_binary_sub; +}; +template <> +struct tile_operator +{ + using type = tile_binary_mul; +}; +template <> +struct tile_operator +{ + using type = tile_binary_div; +}; +template <> +struct tile_operator +{ + using type = tile_binary_le; +}; +template <> +struct tile_operator +{ + using type = tile_binary_ge; +}; +template <> +struct tile_operator +{ + using type = tile_binary_fmin; +}; +template <> +struct tile_operator +{ + using type = tile_binary_fmax; +}; } // namespace transform CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = nvbench::type_list< +# if _CCCL_HAS_NVFP16() + __half, +# endif +# if _CCCL_HAS_NVBF16() + __nv_bfloat16, +# endif + float>; #endif -// ======================================================================== -// Bench harness. -// ======================================================================== template -void run_unary(nvbench::state& state) { - const auto n = state.get_int64("Elements{io}"); - T *in, *out; - cudaMalloc(&in, n * sizeof(T)); cudaMalloc(&out, n * sizeof(T)); - bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize(); - state.add_element_count(n); - state.add_global_memory_reads(n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(in), out, n, Op{}, launch.get_stream()); - }); - cudaFree(in); cudaFree(out); +static void run_unary(nvbench::state& state) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector in(n, T(1)); + thrust::device_vector out(n, thrust::no_init); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, Op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); } 
template -void run_binary(nvbench::state& state) { - const auto n = state.get_int64("Elements{io}"); - T *a, *b, *out; - cudaMalloc(&a, n*sizeof(T)); cudaMalloc(&b, n*sizeof(T)); cudaMalloc(&out, n*sizeof(T)); - bench_init::rand_fill(a, n, 0xA111); - bench_init::rand_fill(b, n, 0xB222); - cudaDeviceSynchronize(); - state.add_element_count(n); - state.add_global_memory_reads(2*n); - state.add_global_memory_writes(n); - state.exec([&](nvbench::launch& launch) { - cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(a, b), out, n, Op{}, launch.get_stream()); - }); - cudaFree(a); cudaFree(b); cudaFree(out); +static void run_binary(nvbench::state& state) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, T(1)); + thrust::device_vector b(n, T(1)); + thrust::device_vector out(n, thrust::no_init); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, out.begin(), n, Op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); } -using element_types = nvbench::type_list<__half, __nv_bfloat16, float>; -inline auto pt_sizes = std::vector{16, 20, 24, 28, 31}; - -#define UNARY_BENCH(name, op) \ - template void name##_bench(nvbench::state& state, nvbench::type_list) { run_unary(state); } \ - NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes); - -UNARY_BENCH(relu, relu_op) -UNARY_BENCH(sigmoid, sigmoid_op) -UNARY_BENCH(tanh, tanh_op) -UNARY_BENCH(gelu, gelu_op) -UNARY_BENCH(sin, sin_op) -UNARY_BENCH(exp, exp_op) - -#define BINARY_BENCH(name, op) \ - template void name##_bench(nvbench::state& state, nvbench::type_list) { run_binary(state); } \ - NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_pt_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes); - 
-BINARY_BENCH(add, binary_add) -BINARY_BENCH(sub, binary_sub) -BINARY_BENCH(mul, binary_mul) -BINARY_BENCH(div, binary_div) -BINARY_BENCH(le, binary_le) -BINARY_BENCH(ge, binary_ge) -BINARY_BENCH(fmin, binary_fmin) -BINARY_BENCH(fmax, binary_fmax) - -NVBENCH_MAIN +inline auto pt_sizes = nvbench::range(16, 32, 4); + +#define UNARY_BENCH(name, op) \ + template \ + static void name##_bench(nvbench::state& state, nvbench::type_list) \ + { \ + run_unary(state); \ + } \ + NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)) \ + .set_name("tile_" #name) \ + .set_type_axes_names({"T{ct}"}) \ + .add_int64_power_of_two_axis("Elements{io}", pt_sizes) + +UNARY_BENCH(relu, relu_op); +UNARY_BENCH(sigmoid, sigmoid_op); +UNARY_BENCH(tanh, tanh_op); +UNARY_BENCH(gelu, gelu_op); +UNARY_BENCH(sin, sin_op); +UNARY_BENCH(exp, exp_op); + +#define BINARY_BENCH(name, op) \ + template \ + static void name##_bench(nvbench::state& state, nvbench::type_list) \ + { \ + run_binary(state); \ + } \ + NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)) \ + .set_name("tile_pt_" #name) \ + .set_type_axes_names({"T{ct}"}) \ + .add_int64_power_of_two_axis("Elements{io}", pt_sizes) + +BINARY_BENCH(add, binary_add); +BINARY_BENCH(sub, binary_sub); +BINARY_BENCH(mul, binary_mul); +BINARY_BENCH(div, binary_div); +BINARY_BENCH(le, binary_le); +BINARY_BENCH(ge, binary_ge); +BINARY_BENCH(fmin, binary_fmin); +BINARY_BENCH(fmax, binary_fmax); diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu deleted file mode 100644 index e2c7a5006bb..00000000000 --- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause - -// Standalone correctness tests for cub::DeviceTransform with the tile -// dispatch hook on. 
Exercises: -// - Built-in trait specs (cuda::std::plus, cuda::std::multiplies) -// - User-registered trait specs (square_op, identity_op) -// - cub::DeviceTransform::Fill (zero-input case) -// -// Built under --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH so the -// hook routes eligible combos to the tile kernel. Sits next to the benches -// so it builds against the same tileiras toolchain; not part of CCCL's -// catch2 suite. - -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() -# include -#endif - -namespace { - -int g_failures = 0; - -#define CUDA_CHECK(expr) \ - do { \ - cudaError_t _e = (expr); \ - if (_e != cudaSuccess) { \ - std::fprintf(stderr, "%s:%d CUDA error: %s\n", __FILE__, __LINE__, \ - cudaGetErrorString(_e)); \ - std::exit(2); \ - } \ - } while (0) - -template -bool eq(T a, T b) { return a == b; } -inline bool eq(float a, float b) { - float diff = std::fabs(a - b); - float tol = 1e-5f * std::fmax(std::fabs(a), std::fabs(b)); - return diff <= std::fmax(tol, 1e-6f); -} - -template -void expect_array(const char* name, const std::vector& got, const std::vector& want) { - if (got.size() != want.size()) { - std::fprintf(stderr, "[FAIL] %s: size %zu != %zu\n", name, got.size(), want.size()); - ++g_failures; - return; - } - int mismatches = 0; - for (size_t i = 0; i < got.size(); ++i) { - if (!eq(got[i], want[i])) { - if (mismatches < 4) { - std::fprintf(stderr, "[FAIL] %s: idx=%zu got=%g want=%g\n", - name, i, double(got[i]), double(want[i])); - } - ++mismatches; - } - } - if (mismatches) { ++g_failures; std::fprintf(stderr, "[FAIL] %s: %d mismatches\n", name, mismatches); } - else { std::printf("[ OK ] %s (n=%zu)\n", name, got.size()); } -} - -// User-defined scalar functors (the call-site type). identity_op and square_op -// don't have a cuda::std equivalent, so we self-register them. 
add and mul map -// to cuda::std::plus / cuda::std::multiplies which CCCL already ships specs for. - -struct identity_op { - template __host__ __device__ T operator()(T a) const { return a; } -}; -struct square_op { - template __host__ __device__ T operator()(T a) const { return a * a; } -}; - -#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() -namespace ct = ::cuda::tiles; - -// Tile-friendly substitutes (must be stateless + trivially default constructible). -struct tile_identity_op { - template __tile__ auto operator()(T v) const { return v; } -}; -struct tile_square_op { - template __tile__ auto operator()(T v) const { return v * v; } -}; -#endif - -template -std::vector ramp(int64_t n, T start = T{0}, T step = T{1}) { - std::vector v(n); - for (int64_t i = 0; i < n; ++i) v[i] = T(start + step * T(i)); - return v; -} - -template -struct GpuVec { - T* d{}; - int64_t n{}; - explicit GpuVec(int64_t n) : n(n) { CUDA_CHECK(cudaMalloc(&d, n * sizeof(T))); } - explicit GpuVec(const std::vector& h) : GpuVec(int64_t(h.size())) { - CUDA_CHECK(cudaMemcpy(d, h.data(), n * sizeof(T), cudaMemcpyHostToDevice)); - } - ~GpuVec() { if (d) cudaFree(d); } - std::vector to_host() const { - std::vector h(n); - CUDA_CHECK(cudaMemcpy(h.data(), d, n * sizeof(T), cudaMemcpyDeviceToHost)); - return h; - } -}; - -template -void test_identity(int64_t n) { - auto h_in = ramp(n, T{1}, T{1}); - GpuVec dx(h_in), dy(n); - CUDA_CHECK(cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(dx.d), dy.d, n, identity_op{})); - CUDA_CHECK(cudaDeviceSynchronize()); - expect_array("identity", dy.to_host(), h_in); -} - -template -void test_square(int64_t n) { - auto h_in = ramp(n, T{1}, T{1}); - std::vector want(n); - for (int64_t i = 0; i < n; ++i) want[i] = h_in[i] * h_in[i]; - GpuVec dx(h_in), dy(n); - CUDA_CHECK(cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(dx.d), dy.d, n, square_op{})); - CUDA_CHECK(cudaDeviceSynchronize()); - expect_array("square", dy.to_host(), want); -} - -template 
-void test_add(int64_t n) { - auto ha = ramp(n, T{1}, T{1}); - auto hb = ramp(n, T{100}, T{2}); - std::vector want(n); - for (int64_t i = 0; i < n; ++i) want[i] = ha[i] + hb[i]; - GpuVec da(ha), db(hb), dc(n); - CUDA_CHECK(cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::plus{})); - CUDA_CHECK(cudaDeviceSynchronize()); - expect_array("add", dc.to_host(), want); -} - -template -void test_mul(int64_t n) { - auto ha = ramp(n, T{1}, T{1}); - auto hb = ramp(n, T{3}, T{1}); - std::vector want(n); - for (int64_t i = 0; i < n; ++i) want[i] = ha[i] * hb[i]; - GpuVec da(ha), db(hb), dc(n); - CUDA_CHECK(cub::DeviceTransform::Transform( - ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::multiplies{})); - CUDA_CHECK(cudaDeviceSynchronize()); - expect_array("mul", dc.to_host(), want); -} - -template -void test_fill(int64_t n, T value) { - GpuVec dy(n); - CUDA_CHECK(cub::DeviceTransform::Fill(dy.d, n, value)); - CUDA_CHECK(cudaDeviceSynchronize()); - std::vector want(n, value); - expect_array("fill", dy.to_host(), want); -} - -} // namespace - -// User self-registers identity_op and square_op as tile-eligible. 
-#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() -CUB_NAMESPACE_BEGIN -namespace transform -{ -template <> struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_eligible : ::cuda::std::true_type {}; -template <> struct tile_operator { using type = tile_identity_op; }; -template <> struct tile_operator { using type = tile_square_op; }; -} // namespace transform -CUB_NAMESPACE_END -#endif - -int main() { - // pow-2, multiple tiles - test_identity(4096); - test_square(2048); - test_add(4096); - test_mul(2048); - test_fill(1024, 42); - - // non-pow-2 num_items (still multiple of 16 to satisfy assume_divisible<16>) - test_add(4112); // 16 * 257 - test_fill(1008, -7); // 16 * 63 - - // single full tile and below-one-tile (still >=16, div by 16) - test_square(16); - test_add(64); - - if (g_failures) { - std::fprintf(stderr, "\n%d test group(s) FAILED\n", g_failures); - return 1; - } - std::printf("\nall tests passed\n"); - return 0; -} From 01418396cec19a620a7bee9a1974deceef09562b Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 18:51:10 -0700 Subject: [PATCH 60/83] accept cuda::aligned_size_t<16> as a compile-time tile-commit hint DeviceTransform::Transform now accepts num_items as cuda::aligned_size_t -- the same opt-in promise cuda::memcpy_async uses (pointers N-aligned, num_items a multiple of N). When N>=16 and the op is tile-eligible, the hook commits to the tile path at compile time and skips runtime_preconditions_valid; otherwise it falls back to the existing runtime alignment/divisibility check. No overload needed -- NumItemsT deduces the type; __get_size_align_v reads the alignment. num_items is unwrapped to a plain integer for the offset machinery (choose_signed_offset requires integral), so plain-integer callers are byte-for-byte unchanged (count_t == NumItemsT, count == num_items). 
Verified on sm_120 that the aligned_size_t commit path is bit-correct. --- cub/cub/device/device_transform.cuh | 42 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 7d8bd316e81..24a828c5fc0 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -97,7 +98,16 @@ struct DeviceTransform // https://github.com/NVIDIA/cccl/issues/8805 for data. We use choose_signed_offset to just check if it can hold the // value passed by the user, but otherwise ignore the chosen signed offset type. using offset_t = ::cuda::std::int64_t; - if (const cudaError_t error = detail::choose_signed_offset::is_exceeding_offset_type(num_items)) + + // num_items may be a plain integer or a cuda::aligned_size_t -- an opt-in promise (the same one + // cuda::memcpy_async uses) that the pointers are N-aligned and num_items is a multiple of N. Unwrap + // it to a plain integer for the offset machinery (choose_signed_offset requires an integral type); + // the alignment promise is read separately by the tile hook below. For a plain integer this is a + // no-op: count_t == NumItemsT and count == num_items. + constexpr ::cuda::std::size_t num_items_align = ::cuda::__get_size_align_v; + using count_t = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>; + const count_t count = static_cast(num_items); + if (const cudaError_t error = detail::choose_signed_offset::is_exceeding_offset_type(count)) { return error; } @@ -105,24 +115,24 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() - // Opt-in tile path. 
When the (Op, T, NIn) combo is trait-eligible AND - // the runtime alignment + divisibility preconditions hold, route to the - // tile kernel. Otherwise fall through to the standard CUB dispatch - // below -- CUB's existing kernels handle the unaligned tail case via - // their own internal logic, so misalignment is a graceful fallback, - // not an error. + // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible we route to the tile kernel: + // - if num_items is a cuda::aligned_size_t<N> with N >= 16, the caller has promised 16-byte pointer + // alignment + divisibility, so we commit to tile at compile time and skip the runtime check; + // - otherwise we check the alignment/divisibility preconditions at runtime and fall through to + // the standard CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail + // case, so this is a graceful fallback, not an error). if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v - && cub::detail::transform::tile::tile_dispatch_eligible_v< - TransformOp, - RandomAccessIteratorOut, - RandomAccessIteratorsIn...>) + && cub::detail::transform::tile:: + tile_dispatch_eligible_v) { - if (cub::detail::transform::tile::runtime_preconditions_valid( - inputs, output, static_cast(num_items))) + if constexpr (num_items_align >= 16) + { + return cub::detail::transform::tile::dispatch(inputs, output, static_cast(count), stream); + } + else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(count))) { - return cub::detail::transform::tile::dispatch( - inputs, output, static_cast(num_items), stream); + return cub::detail::transform::tile::dispatch(inputs, output, static_cast(count), stream); } } #endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() @@ -145,7 +155,7 @@ struct DeviceTransform return detail::transform::dispatch( ::cuda::std::move(inputs), ::cuda::std::move(output), - static_cast(num_items), + static_cast(count),
::cuda::std::move(predicate), ::cuda::std::move(transform_op), stream, From 99042d3933723f1c231ec5b80ca85d4a92793cd0 Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 19:16:55 -0700 Subject: [PATCH 61/83] clang-format kernel_transform_tile.cuh and tuning_transform_tile.cuh --- .../dispatch/kernels/kernel_transform_tile.cuh | 16 ++++++++-------- .../dispatch/tuning/tuning_transform_tile.cuh | 2 -- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 585cefc833d..778721f6257 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -17,15 +17,14 @@ #if _CCCL_CUB_HAS_TILE_TRANSFORM() -# include - # include +# include + CUB_NAMESPACE_BEGIN namespace detail::transform::tile { - // Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing: // assume_aligned<16> -- promises the pointer is 16-byte aligned, so the compiler can pick LDG.E.128 vectorized // loads/stores. @@ -36,7 +35,7 @@ namespace detail::transform::tile template [[nodiscard]] __tile__ auto make_aligned_partition_view(T* ptr, N n) { - namespace ct = ::cuda::tiles; + namespace ct = ::cuda::tiles; const auto ptr_align = ct::assume_aligned<16>(ptr); auto span = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; return ct::partition_view{span, ct::shape{}}; @@ -52,13 +51,15 @@ template __tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) { - namespace ct = ::cuda::tiles; + namespace ct = ::cuda::tiles; using cub::detail::transform::tile::make_aligned_partition_view; const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); const auto out_view = make_aligned_partition_view(out, n); - auto load_one = [bx, n](auto* ptr) { return make_aligned_partition_view(ptr, n).load_masked(bx); }; + auto load_one = [bx, n](auto* ptr) { + return make_aligned_partition_view(ptr, n).load_masked(bx); + }; out_view.store_masked(Fn{}(load_one(ins)...), bx); } @@ -66,7 +67,7 @@ transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, co template __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) { - namespace ct = ::cuda::tiles; + namespace ct = ::cuda::tiles; using cub::detail::transform::tile::make_aligned_partition_view; const auto bx = ct::bid().x; @@ -75,7 +76,6 @@ __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __rest using tile_t = ct::tile>; out_view.store_masked(ct::full(value), bx); } - } // namespace detail::transform::tile CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh index a7715e6f195..8a11ad60f7a 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -36,7 +36,6 @@ CUB_NAMESPACE_BEGIN namespace detail::transform::tile { - // mufu_heavy=true tells the policy the functor body has heavy MUFU usage. // for small data types, vectorized load will make them arrive packed in // registers and the compiler unpacks them and packs them back. 
reducing the @@ -79,7 +78,6 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability return items * threads_per_block; } - } // namespace detail::transform::tile CUB_NAMESPACE_END From 4a5ec541567abc89550186f071341523faaaaee8 Mon Sep 17 00:00:00 2001 From: Nan An Date: Thu, 11 Jun 2026 23:48:26 -0700 Subject: [PATCH 62/83] guard fp16/bf16 in tile pytorch bench on CTK 12.2+ Matches the base transform/pytorch.cu: __half/__nv_bfloat16 are only added to the type axis under _CCCL_CTK_AT_LEAST(12, 2). On CTK 12.0 __nv_bfloat16 has only float/double constructors, so constructing it from an int literal (T(1) in run_unary/run_binary) is ambiguous -- which broke the CTK 12.0 build. --- cub/benchmarks/bench/transform/tile/pytorch.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 006f88ff5c6..ccb4680ba21 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -425,10 +425,10 @@ CUB_NAMESPACE_END using element_types = nvbench::type_list; #else using element_types = nvbench::type_list< -# if _CCCL_HAS_NVFP16() +# if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2) __half, # endif -# if _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) __nv_bfloat16, # endif float>; From 76ac55f715cd3da405538d3db9d3320426cdd91c Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 12:24:01 -0700 Subject: [PATCH 63/83] drop redundant comment on the gate macro --- cub/cub/device/dispatch/dispatch_transform_tile_config.cuh | 3 --- 1 file changed, 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh index 8c25ea9bd30..05c928acc1c 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ 
-31,9 +31,6 @@ #define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION() -// Defined as a literal 1/0 (not (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(...))) so that -// `#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()` in non-system code (benches, tests) does not -// generate `defined` via macro expansion, which is UB and trips -Wexpansion-to-defined under -Werror. #if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) # define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1 #else From 67b94897cb54721828597632f4f0487c41c8f32e Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 15:37:47 -0700 Subject: [PATCH 64/83] address review nits in tile transform dispatch - device_transform.cuh: reflow the aligned_size_t comment to 120 cols; use `constexpr auto num_items_align` instead of spelling out ::cuda::std::size_t - dispatch_transform_tile.cuh: use ::cuda::std::iter_value_t over the CUB-local cub::detail::it_value_t; drop the redundant include (cudaError_t/cudaStream_t/cudaGetLastError come in transitively via cub/util_debug.cuh) --- cub/cub/device/device_transform.cuh | 14 ++++++-------- .../device/dispatch/dispatch_transform_tile.cuh | 5 ++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 24a828c5fc0..9fff2582bc5 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -99,14 +99,12 @@ struct DeviceTransform // value passed by the user, but otherwise ignore the chosen signed offset type. using offset_t = ::cuda::std::int64_t; - // num_items may be a plain integer or a cuda::aligned_size_t -- an opt-in promise (the same one - // cuda::memcpy_async uses) that the pointers are N-aligned and num_items is a multiple of N. Unwrap - // it to a plain integer for the offset machinery (choose_signed_offset requires an integral type); - // the alignment promise is read separately by the tile hook below. 
For a plain integer this is a - // no-op: count_t == NumItemsT and count == num_items. - constexpr ::cuda::std::size_t num_items_align = ::cuda::__get_size_align_v; - using count_t = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>; - const count_t count = static_cast(num_items); + // num_items may be a plain integer or cuda::aligned_size_t<N> (the cuda::memcpy_async-style opt-in promising N-byte + // pointer alignment + size divisibility). Unwrap to a plain integer for the offset machinery (choose_signed_offset + // needs an integral type); the tile hook below reads the alignment promise. No-op for a plain integer. + constexpr auto num_items_align = ::cuda::__get_size_align_v; + using count_t = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>; + const count_t count = static_cast(num_items); if (const cudaError_t error = detail::choose_signed_offset::is_exceeding_offset_type(count)) { return error; diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 7ce52562fa0..cee5effa0bb 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -37,6 +37,7 @@ # include # include +# include # include # include # include @@ -45,8 +46,6 @@ # include # include -# include - CUB_NAMESPACE_BEGIN namespace detail::transform::tile @@ -115,7 +114,7 @@ template inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) - && cub::transform::tile_eligible_v, sizeof...(InIters)>; + && cub::transform::tile_eligible_v, sizeof...(InIters)>; // Runtime predicate consulted by the cub::DeviceTransform tile hook before // it commits to the tile path.
Mirrors how CUB's dispatch_t::CanVectorize From 8614d456c66eda5c95b6782dfc29ecf993cf97a8 Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 15:37:47 -0700 Subject: [PATCH 65/83] collapse tile_eligible to a single variable template tile_eligible was a false_type struct that tile_eligible_v just forwarded to. Drop the struct and make tile_eligible_v the specializable extension point directly -- same (Op,T,NIn) granularity, one name instead of two. Updates the built-in half/bf16 specializations and the bench/test registrations to specialize the variable template (partial specialization over T). --- .../bench/transform/tile/babelstream.cu | 12 ++---- cub/benchmarks/bench/transform/tile/copy.cu | 3 +- .../bench/transform/tile/grayscale.cu | 3 +- .../bench/transform/tile/pytorch.cu | 42 +++++++------------ .../dispatch_transform_tile_traits.cuh | 31 ++++++-------- cub/test/catch2_test_device_transform_tile.cu | 6 +-- 6 files changed, 34 insertions(+), 63 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index 6e9caf03f2d..60ccfd9f5f7 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -86,17 +86,13 @@ CUB_NAMESPACE_BEGIN namespace transform { template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index 85ed12e0d4d..e766c420286 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ 
b/cub/benchmarks/bench/transform/tile/copy.cu @@ -36,8 +36,7 @@ CUB_NAMESPACE_BEGIN namespace transform { template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index f9ab98d62ad..daee79afc16 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -41,8 +41,7 @@ CUB_NAMESPACE_BEGIN namespace transform { template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index ccb4680ba21..527ac65eb72 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -286,23 +286,17 @@ namespace transform { // Unary template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { @@ -354,29 +348,21 @@ struct tile_mufu_heavy : ::cuda::std::true_type // Binary template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; 
template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index f51d280264b..c347f6f0631 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -4,10 +4,9 @@ // Compile-time policy for cub::DeviceTransform's tile path. // // PUBLIC EXTENSION POINTS (cub::transform) -- two independent axes: -// tile_eligible -- specialize to true_type to opt a (functor -// type, element type, input arity) combo into -// the tile dispatch path. Eligibility only. -// tile_eligible_v<...> -- variable-template companion. +// tile_eligible_v -- specialize to true to opt a (functor type, +// element type, input arity) combo into the +// tile dispatch path. Eligibility only. // tile_operator -- the __tile__ functor the tile kernel runs // for Op. No default: every tile-eligible Op // must specialize it with `using type = and tile_operator. +// always registers both: tile_eligible_v and tile_operator. // // INTERNAL (cub::detail::transform::tile): // tile_plus, tile_multiplies -- shipped tile-friendly substitutes used by @@ -56,16 +55,14 @@ CUB_NAMESPACE_BEGIN // Public extension surface. 
namespace transform { +// Opt a (functor type, element type, input arity) combo into the tile dispatch path: specialize this to +// true for the combo. Eligibility only -- the __tile__ functor to actually run is named by tile_operator. template -struct tile_eligible : ::cuda::std::false_type -{}; - -template -inline constexpr bool tile_eligible_v = tile_eligible::value; +inline constexpr bool tile_eligible_v = false; // The __tile__ functor the tile kernel runs for Op -- the tile-side mirror of the scalar Op. There is // no default: a scalar functor cannot be invoked on ct::tile, so every tile-eligible Op must specialize -// this with a `type` naming a stateless __tile__ functor. tile_eligible says a combo MAY use the +// this with a `type` naming a stateless __tile__ functor. tile_eligible_v says a combo MAY use the // tile path; tile_operator says WHAT the tile kernel runs. template struct tile_operator @@ -120,11 +117,9 @@ namespace transform // cuda::std::plus / multiplies are scalar ops, so each is marked eligible and given a tile_operator mirror. 
# if _CCCL_HAS_NVFP16() template <> -struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v<::cuda::std::plus<::__half>, ::__half, 2> = true; template <> -struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__half>, ::__half, 2> = true; template <> struct tile_operator<::cuda::std::plus<::__half>> { @@ -139,11 +134,9 @@ struct tile_operator<::cuda::std::multiplies<::__half>> # if _CCCL_HAS_NVBF16() template <> -struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true; template <> -struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true; template <> struct tile_operator<::cuda::std::plus<::__nv_bfloat16>> { diff --git a/cub/test/catch2_test_device_transform_tile.cu b/cub/test/catch2_test_device_transform_tile.cu index 3ff05fac915..f77eea0e31b 100644 --- a/cub/test/catch2_test_device_transform_tile.cu +++ b/cub/test/catch2_test_device_transform_tile.cu @@ -66,8 +66,7 @@ CUB_NAMESPACE_BEGIN namespace transform { template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { @@ -75,8 +74,7 @@ struct tile_operator }; template -struct tile_eligible : ::cuda::std::true_type -{}; +inline constexpr bool tile_eligible_v = true; template <> struct tile_operator { From 81f21333b3a65ccf29b06d28e4903272ea968caa Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 15:37:47 -0700 Subject: [PATCH 66/83] avoid nvcc 13.4 tile lambda-linkage miscompile: call partition-view helper directly 
transform_kernel loaded inputs through a load_one lambda that called make_aligned_partition_view. Under --expt-relaxed-constexpr (which the benches pass) nvcc 13.4 treats the implicitly-constexpr lambda as __host__ __device__, so a non-tile copy references the __tile__ helper -- which has no body outside tile space -- and emits it as a bodiless internal-linkage declaration, tripping the IR verifier ("Broken module"). Call the helper directly in the pack expansion instead; the kernel body is pure-tile so the conflict can't arise. Minimal repro + nvbug filed; same execution-space family as the _CCCL_API __tile__ strip. --- .../kernels/kernel_transform_tile.cuh | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 778721f6257..a8bcd7dd836 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -47,30 +47,26 @@ template // // assume_divisible<16> -- promises num_items % 16 == 0, so the tile DSL can elide tail handling. // assume_bounded_below<0> -- promises num_items >= 0; enables sign-comparison simplifications. +// +// NOTE: make_aligned_partition_view is invoked directly. do NOT wrap these calls in a lambda because of compiler bug: +// templated __tile__ helper + a lambda that calls it + --expt-relaxed-constexpr produces invalid IR. template __tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) { - namespace ct = ::cuda::tiles; - using cub::detail::transform::tile::make_aligned_partition_view; + namespace ct = ::cuda::tiles; const auto bx = ct::bid().x; + const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); const auto out_view = make_aligned_partition_view(out, n); - auto load_one = [bx, n](auto* ptr) { - return make_aligned_partition_view(ptr, n).load_masked(bx); - }; - - out_view.store_masked(Fn{}(load_one(ins)...), bx); + out_view.store_masked(Fn{}(make_aligned_partition_view(ins, n).load_masked(bx)...), bx); } template __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) { - namespace ct = ::cuda::tiles; - using cub::detail::transform::tile::make_aligned_partition_view; - const auto bx = ct::bid().x; - + namespace ct = ::cuda::tiles; + const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); const auto out_view = make_aligned_partition_view(out, n); using tile_t = ct::tile>; From 393ce96381fbc447158ec29d27f86f9d1dd87a62 Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 16:10:42 -0700 Subject: [PATCH 67/83] add opt-in CMake option to build cub::DeviceTransform's tile path under --enable-tile CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF, defined in cub/CMakeLists.txt) scopes nvcc --enable-tile + the dispatch opt-in macro to just the tile transform test and the bench/transform/tile benches, via per-target compile options in test/ and benchmarks/. This replaces forcing --enable-tile through global CMAKE_CUDA_FLAGS, which also lands on the C++17 c2h helper lib where cuda_tile.h hard-errors on the dialect. CI never sets the option, so the tile path stays compiled out there; locally the test builds + passes (960 assertions on sm_120). 
--- cub/CMakeLists.txt | 10 ++++++++++ cub/benchmarks/CMakeLists.txt | 16 ++++++++++++++++ cub/test/CMakeLists.txt | 16 ++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt index 4c8c778f7fe..4b872a7993c 100644 --- a/cub/CMakeLists.txt +++ b/cub/CMakeLists.txt @@ -10,6 +10,16 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) +# Opt-in: build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. +# Defaults OFF; CI never sets it, so the tile code stays gated out except in an explicit local +# --enable-tile build. Applied per-target in test/ and benchmarks/ -- never via global CMAKE_CUDA_FLAGS, +# which would also hit the C++17 c2h helper lib where cuda_tile.h hard-errors on the dialect. +option( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)." + OFF +) + option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") set(CUB_ENABLE_TUNING OFF) diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt index e54cf1c80db..c9e9e7893cd 100644 --- a/cub/benchmarks/CMakeLists.txt +++ b/cub/benchmarks/CMakeLists.txt @@ -91,6 +91,22 @@ function(add_bench target_name bench_name bench_src) cccl.nvbench_helper nvbench::main ) + + # Tile-DSL transform benches: build under --enable-tile + the dispatch opt-in when requested. Gated by + # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI builds the tile/ benches with the tile path off. 
+ if ( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + AND "${bench_src}" MATCHES "/transform/tile/" + ) + target_compile_options( + ${bench_target} + PRIVATE "$<$:--enable-tile>" + ) + target_compile_definitions( + ${bench_target} + PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + ) + endif() endfunction() function(add_bench_dir bench_dir) diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index ce46a86b93e..0214861c053 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -172,6 +172,22 @@ function( target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() + # Tile-DSL transform test: compile under --enable-tile and turn on the dispatch hook. Gated by + # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI keeps the tile path compiled out. + if ( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + AND "${test_src}" MATCHES "test_device_transform_tile\\.cu$" + ) + target_compile_options( + ${test_target} + PRIVATE $<$:--enable-tile> + ) + target_compile_definitions( + ${test_target} + PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + ) + endif() + # enable lambdas for all API examples if ("${test_src}" MATCHES "test.+_api\\.cu$") target_compile_options( From 76d02eb7dc1b1399b506542070cfdfdd9683899e Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 16:56:41 -0700 Subject: [PATCH 68/83] gate cub::DeviceTransform tile path on CTK 13.4 tile C++ exists since 13.3, but the 13.3 tile compiler has too many codegen issues, so 13.4 is the supported floor. _CCCL_CUB_HAS_TILE_TRANSFORM() now requires _CCCL_CTK_AT_LEAST(13, 4) in addition to --enable-tile, so the tile headers compile out entirely below 13.4. The CMake option errors cleanly if enabled on < 13.4 instead of failing later on an unrecognized --enable-tile. 
--- cub/CMakeLists.txt | 11 +++++++++++ .../dispatch/dispatch_transform_tile_config.cuh | 14 ++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt index 4b872a7993c..e45b5635e91 100644 --- a/cub/CMakeLists.txt +++ b/cub/CMakeLists.txt @@ -19,6 +19,17 @@ option( "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)." OFF ) +if ( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + AND "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" + AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4 +) + message( + FATAL_ERROR + "CCCL_ENABLE_TILE_TRANSFORM_DISPATCH requires CUDA 13.4+ (nvcc --enable-tile). " + "Found ${CMAKE_CUDA_COMPILER_VERSION}." + ) +endif() option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh index 05c928acc1c..833bca94d83 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ -6,11 +6,13 @@ // // _CCCL_CUB_HAS_TILE_TRANSFORM() // True when nvcc is compiling in tile mode (--enable-tile, i.e. -// _CCCL_TILE_COMPILATION()). The other preconditions tile needs are -// enforced where they belong: CTK 13.3+ is implied because --enable-tile -// is a 13.3+ nvcc flag, and C++20 is enforced by cuda_tile.h itself with -// an explicit #error. When false, the tile headers (kernel / tuning / -// dispatch / traits) are skipped entirely. +// _CCCL_TILE_COMPILATION()) AND the toolkit is CTK 13.4+. tile C++ exists +// since 13.3, but we require 13.4: the 13.3 tile compiler has too many +// codegen issues, so 13.4 is the supported floor. (C++20 is enforced by +// cuda_tile.h itself with an explicit #error.) The sm_80+ requirement is +// handled at runtime in the dispatch + NV_IF_TARGET in the kernels, not +// here, since this gate is host+device. 
When false, the tile headers +// (kernel / tuning / dispatch / traits) are skipped entirely. // // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() // True when the dispatch hook in cub::DeviceTransform should fire. Same as @@ -29,7 +31,7 @@ # pragma system_header #endif // no system header -#define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION() +#define _CCCL_CUB_HAS_TILE_TRANSFORM() (_CCCL_TILE_COMPILATION() && _CCCL_CTK_AT_LEAST(13, 4)) #if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) # define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1 From 6e713645baef0ae4340cfa52487e6028e23a75f4 Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 17:45:39 -0700 Subject: [PATCH 69/83] gate the tile path on sm_80 (NV_IF_TARGET + runtime cc check) tile requires sm_80+. Mirrors how CUB handles its arch-specific ublkcp kernel -- no CMake arch gate: (1) NV_IF_TARGET(NV_PROVIDES_SM_80) around the transform/fill kernel bodies, so sub-80 cubins carry no tile SASS; (2) a runtime device_supports_tile() (cc >= 80) check in the dispatch hook -- below sm_80, or if the capability query fails, fall back to standard CUB. --enable-tile itself accepts all CTK-13.4 arches, so the floor is enforced at runtime, exactly like ublkcp's cc >= 90 policy gate. --- cub/cub/device/device_transform.cuh | 22 +++++++++----- .../dispatch/dispatch_transform_tile.cuh | 20 ++++++++----- .../kernels/kernel_transform_tile.cuh | 30 +++++++++++-------- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 9fff2582bc5..c6059b1cb69 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -113,24 +113,32 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() - // Opt-in tile path. 
When the (Op, T, NIn) combo is trait-eligible we route to the tile kernel: + // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+ we route to the + // tile kernel: // - if num_items is a cuda::aligned_size_t=16, the caller has promised 16-byte pointer // alignment + divisibility, so we commit to tile at compile time and skip the runtime check; // - otherwise we check the alignment/divisibility preconditions at runtime and fall through to // the standard CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail // case, so this is a graceful fallback, not an error). + // device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if the capability + // query fails) we fall through to the standard CUB dispatch. if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v && cub::detail::transform::tile:: tile_dispatch_eligible_v) { - if constexpr (num_items_align >= 16) + if (cub::detail::transform::tile::device_supports_tile()) { - return cub::detail::transform::tile::dispatch(inputs, output, static_cast(count), stream); - } - else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(count))) - { - return cub::detail::transform::tile::dispatch(inputs, output, static_cast(count), stream); + if constexpr (num_items_align >= 16) + { + return cub::detail::transform::tile::dispatch( + inputs, output, static_cast(count), stream); + } + else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(count))) + { + return cub::detail::transform::tile::dispatch( + inputs, output, static_cast(count), stream); + } } } #endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index cee5effa0bb..ff20071c6bb 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ 
b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -10,7 +10,7 @@ // used by `dispatch` // User-facing extension points (tile_eligible / tile_mufu_heavy) live in // dispatch_transform_tile_traits.cuh under cub::transform. -// Requires CTK 13.3 or newer and nvcc invoked with --enable-tile. +// Requires CTK 13.4 or newer and nvcc invoked with --enable-tile. #pragma once @@ -32,11 +32,13 @@ # include # include # include +# include # include # include # include +# include # include # include # include @@ -104,18 +106,22 @@ struct DeviceTransform } }; -// Combined compile-time predicate used by cub::DeviceTransform's __transform_internal -// to decide whether to route a given (Op, OutIter, InIters...) to the tile path. -// The call site lifts this into an `if constexpr`: when this is true the hook -// tries the tile kernel first and, on runtime alignment / divisibility -// failure, falls through to the standard CUB dispatch below. When false, the -// tile branch is discarded and only CUB's standard path is emitted. +// Combined compile-time predicate for whether (Op, OutIter, InIters...) can use the tile path. We use this with +// `if constexpr` for dispatch: when true the hook tries the tile kernel first and, on runtime alignment/divisibility +// failure, falls through to the standard CUB dispatch; when false the tile branch is discarded entirely. template inline constexpr bool tile_dispatch_eligible_v = THRUST_NS_QUALIFIER::is_contiguous_iterator_v && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) && cub::transform::tile_eligible_v, sizeof...(InIters)>; +// Runtime arch gate: tile needs sm_80+. False (fall back to CUB) below sm_80 or if the cc query fails. 
+[[nodiscard]] CUB_RUNTIME_FUNCTION inline bool device_supports_tile() +{ + ::cuda::compute_capability cc{}; + return cub::detail::ptx_compute_cap(cc) == ::cudaSuccess && cc >= ::cuda::compute_capability{8, 0}; +} + // Runtime predicate consulted by the cub::DeviceTransform tile hook before // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize // guards the vectorized kernel. The tile kernels use ct::assume_aligned<16> diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index a8bcd7dd836..21345c6bf31 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -19,6 +19,8 @@ # include +# include + # include CUB_NAMESPACE_BEGIN @@ -48,29 +50,33 @@ template // assume_divisible<16> -- promises num_items % 16 == 0, so the tile DSL can elide tail handling. // assume_bounded_below<0> -- promises num_items >= 0; enables sign-comparison simplifications. // +// The body is guarded by NV_IF_TARGET(NV_PROVIDES_SM_80): tile requires sm_80+, so on older arches the kernel +// compiles to a no-op (no unsupported SASS). The dispatch only launches it on sm_80+ devices (runtime cc check). +// // NOTE: make_aligned_partition_view is invoked directly. do NOT wrap these calls in a lambda because of compiler bug: // templated __tile__ helper + a lambda that calls it + --expt-relaxed-constexpr produces invalid IR. template __tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) { - namespace ct = ::cuda::tiles; - const auto bx = ct::bid().x; - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - - const auto out_view = make_aligned_partition_view(out, n); - out_view.store_masked(Fn{}(make_aligned_partition_view(ins, n).load_masked(bx)...), bx); + namespace ct = ::cuda::tiles; + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const auto out_view = make_aligned_partition_view(out, n); + out_view.store_masked(Fn{}(make_aligned_partition_view(ins, n).load_masked(bx)...), bx);)); } template __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) { - namespace ct = ::cuda::tiles; - const auto bx = ct::bid().x; - const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - const auto out_view = make_aligned_partition_view(out, n); - using tile_t = ct::tile>; - out_view.store_masked(ct::full(value), bx); + namespace ct = ::cuda::tiles; + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const auto out_view = make_aligned_partition_view(out, n); + using tile_t = ct::tile>; + out_view.store_masked(ct::full(value), bx);)); } } // namespace detail::transform::tile From a1c01c71087ffea97afaa4f1b9fc742eca2f57a2 Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 17:45:39 -0700 Subject: [PATCH 70/83] default CCCL_ENABLE_TILE_TRANSFORM_DISPATCH ON for CTK 13.4+ Instead of a hard OFF default, the option now defaults ON when the toolkit can build the tile path (nvcc NVIDIA 13.4+), so 13.4+ configs -- including CI -- build and run the tile transform test (and benches) automatically; below 13.4 it stays OFF and compiles out. 
The sm_80+ hardware floor is handled at runtime (dispatch cc check + NV_IF_TARGET in the kernels), so an auto-enabled 13.4+ build still runs correctly on any GPU (falls back to standard CUB below sm_80). An explicit ON below 13.4 still errors via the existing guard. --- cub/CMakeLists.txt | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt index e45b5635e91..820614d1e65 100644 --- a/cub/CMakeLists.txt +++ b/cub/CMakeLists.txt @@ -10,14 +10,21 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) -# Opt-in: build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. -# Defaults OFF; CI never sets it, so the tile code stays gated out except in an explicit local -# --enable-tile build. Applied per-target in test/ and benchmarks/ -- never via global CMAKE_CUDA_FLAGS, -# which would also hit the C++17 c2h helper lib where cuda_tile.h hard-errors on the dialect. +# Build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. Defaults ON when the +# toolkit can build it (CTK 13.4+), so 13.4+ configs -- including CI -- exercise the tile path automatically; OFF +# and compiled out below 13.4. The sm_80+ floor is enforced at runtime (dispatch cc check + NV_IF_TARGET in the +# kernels), so a 13.4+ build still runs correctly on any GPU. +set(_cccl_tile_transform_default OFF) +if ( + "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" + AND NOT "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4 +) + set(_cccl_tile_transform_default ON) +endif() option( CCCL_ENABLE_TILE_TRANSFORM_DISPATCH "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)." 
- OFF + ${_cccl_tile_transform_default} ) if ( CCCL_ENABLE_TILE_TRANSFORM_DISPATCH From b9d49d02d8b0193382e0c17b4bd810b36bc1a013 Mon Sep 17 00:00:00 2001 From: Nan An Date: Fri, 12 Jun 2026 18:02:09 -0700 Subject: [PATCH 71/83] trim verbose comments in the tile transform headers Comment-only cleanup: tighten the over-long doc blocks (gate-macro CTK note, tile_operator, runtime_preconditions_valid, the dispatch bridge, and the kernel header) and pack multiline comments to the 120-col limit. No code change. --- .../dispatch/dispatch_transform_tile.cuh | 20 +++++-------------- .../dispatch_transform_tile_config.cuh | 9 ++------- .../dispatch_transform_tile_traits.cuh | 5 +---- .../kernels/kernel_transform_tile.cuh | 18 +++++++---------- 4 files changed, 15 insertions(+), 37 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index ff20071c6bb..48393cc6dcc 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -122,11 +122,8 @@ inline constexpr bool tile_dispatch_eligible_v = return cub::detail::ptx_compute_cap(cc) == ::cudaSuccess && cc >= ::cuda::compute_capability{8, 0}; } -// Runtime predicate consulted by the cub::DeviceTransform tile hook before -// it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize -// guards the vectorized kernel. The tile kernels use ct::assume_aligned<16> -// and ct::assume_divisible<16>, so violating these at runtime is UB. -// Returns false to tell the hook to fall back to the standard CUB dispatch. +// Runtime precondition the tile hook checks before dispatching: 16-byte pointer alignment + num_items % 16 == 0 +// (the kernels assume_aligned<16>/assume_divisible<16>, so violating these is UB). False -> fall back to CUB. 
template [[nodiscard]] CUB_RUNTIME_FUNCTION bool runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) @@ -150,16 +147,9 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte return aligned_out && aligned_in && (num_items % items_divisor) == 0; } -// Bridge between cub::DeviceTransform::__transform_internal and the tile -// DeviceTransform above. Precondition: tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. The kernel -// itself assumes 16-byte pointer alignment and num_items divisibility; the -// caller (the hook in device_transform.cuh) is responsible for checking -// runtime_preconditions_valid first. -// -// The tile kernel is launched with tile_operator_t: for a scalar Op that is its -// registered tile-friendly mirror (a __tile__ functor), and for an already-tile Op it -// is Op itself. A scalar functor cannot be invoked on ct::tile arguments. +// Bridge from cub::DeviceTransform::__transform_internal to the tile DeviceTransform. Precondition (the caller +// checks it): tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. Launches the kernel +// with tile_operator_t -- Op's registered __tile__ mirror (a scalar functor can't be invoked on ct::tile). template [[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh index 833bca94d83..0c06b091335 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ -5,13 +5,8 @@ // share. Two macros: // // _CCCL_CUB_HAS_TILE_TRANSFORM() -// True when nvcc is compiling in tile mode (--enable-tile, i.e. -// _CCCL_TILE_COMPILATION()) AND the toolkit is CTK 13.4+. 
tile C++ exists -// since 13.3, but we require 13.4: the 13.3 tile compiler has too many -// codegen issues, so 13.4 is the supported floor. (C++20 is enforced by -// cuda_tile.h itself with an explicit #error.) The sm_80+ requirement is -// handled at runtime in the dispatch + NV_IF_TARGET in the kernels, not -// here, since this gate is host+device. When false, the tile headers +// True when nvcc is in tile mode (--enable-tile / _CCCL_TILE_COMPILATION()) AND CTK 13.4+. The sm_80+ +// requirement is handled at runtime + NV_IF_TARGET in the kernels, not here. When false, the tile headers // (kernel / tuning / dispatch / traits) are skipped entirely. // // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index c347f6f0631..d9db2d4684c 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -60,10 +60,7 @@ namespace transform template inline constexpr bool tile_eligible_v = false; -// The __tile__ functor the tile kernel runs for Op -- the tile-side mirror of the scalar Op. There is -// no default: a scalar functor cannot be invoked on ct::tile, so every tile-eligible Op must specialize -// this with a `type` naming a stateless __tile__ functor. tile_eligible_v says a combo MAY use the -// tile path; tile_operator says WHAT the tile kernel runs. +// The __tile__ functor the tile kernel runs for Op. template struct tile_operator { diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index 21345c6bf31..bd36bd1843d 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -43,18 +43,14 @@ template return ct::partition_view{span, ct::shape{}}; } -// Tile DSL kernels backing cub::DeviceTransform's tile path. 
The kernels assume 16-byte alignment on every pointer -// and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in the dispatch header are -// responsible for honoring those preconditions. +// Tile DSL kernels backing cub::DeviceTransform's tile path. They assume 16-byte pointer alignment + 16-divisible +// num_items (so the compiler picks LDG.E.128); the dispatch header honors that. NV_IF_TARGET(NV_PROVIDES_SM_80) +// guards the body -- tile needs sm_80+, so sub-80 arches get a no-op kernel (dispatch only launches it on sm_80+). +// assume_divisible<16> -- num_items % 16 == 0, so the tile DSL can elide tail handling. +// assume_bounded_below<0> -- num_items >= 0; enables sign-comparison simplifications. // -// assume_divisible<16> -- promises num_items % 16 == 0, so the tile DSL can elide tail handling. -// assume_bounded_below<0> -- promises num_items >= 0; enables sign-comparison simplifications. -// -// The body is guarded by NV_IF_TARGET(NV_PROVIDES_SM_80): tile requires sm_80+, so on older arches the kernel -// compiles to a no-op (no unsupported SASS). The dispatch only launches it on sm_80+ devices (runtime cc check). -// -// NOTE: make_aligned_partition_view is invoked directly. do NOT wrap these calls in a lambda because of compiler bug: -// templated __tile__ helper + a lambda that calls it + --expt-relaxed-constexpr produces invalid IR. +// NOTE: make_aligned_partition_view is invoked directly -- do NOT wrap these calls in a lambda: nvcc 13.4 +// miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR). template __tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... 
ins) From f76647e75c45fa6bbf8b0f1f7fdecfaf2329fe8a Mon Sep 17 00:00:00 2001 From: Nan An Date: Sat, 13 Jun 2026 20:37:37 -0700 Subject: [PATCH 72/83] tidy tile bench type axes copy: drop float (same 4-byte width as int32_t -- redundant for an identity-copy bandwidth bench, which only depends on element size). All tile benches: use nvbench:: type aliases (int8_t/int16_t/int32_t/float32_t/float64_t) for the element-type axis instead of std:: -- same underlying types, but nvbench's namespaced spelling is the bench convention. __half/__nv_bfloat16 (pytorch) keep their names; no nvbench alias. --- cub/benchmarks/bench/transform/tile/babelstream.cu | 2 +- cub/benchmarks/bench/transform/tile/copy.cu | 2 +- cub/benchmarks/bench/transform/tile/grayscale.cu | 2 +- cub/benchmarks/bench/transform/tile/pytorch.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index 60ccfd9f5f7..45260565b2a 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -122,7 +122,7 @@ CUB_NAMESPACE_END #ifdef TUNE_T using element_types = nvbench::type_list; #else -using element_types = nvbench::type_list; +using element_types = nvbench::type_list; #endif inline auto array_size_powers = nvbench::range(16, 32, 4); diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index e766c420286..e6b869f15e0 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -49,7 +49,7 @@ CUB_NAMESPACE_END #ifdef TUNE_T using element_types = nvbench::type_list; #else -using element_types = nvbench::type_list; +using element_types = nvbench::type_list; #endif template diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index daee79afc16..f7e2a581ae0 100644 --- 
a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -54,7 +54,7 @@ CUB_NAMESPACE_END #ifdef TUNE_T using value_types = nvbench::type_list; #else -using value_types = nvbench::type_list; +using value_types = nvbench::type_list; #endif template diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index 527ac65eb72..be11d97523f 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -417,7 +417,7 @@ using element_types = nvbench::type_list< # if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) __nv_bfloat16, # endif - float>; + nvbench::float32_t>; #endif template From b1df2c620102c685d0fca29ec6c8ed0bffb4ef0f Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 15 Jun 2026 09:51:48 -0700 Subject: [PATCH 73/83] collapse tile_mufu_heavy to a single variable template Same treatment as tile_eligible: drop the false_type struct, make tile_mufu_heavy_v the specializable extension point directly. The dispatch consumer and the pytorch bench's specializations already used the _v form; this just removes the redundant struct, matching the _v-trait convention. --- cub/benchmarks/bench/transform/tile/pytorch.cu | 17 ++++++----------- .../dispatch/dispatch_transform_tile_traits.cuh | 14 ++++---------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index be11d97523f..a63a89f68da 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -2,7 +2,7 @@ // SPDX-License-Identifier: BSD-3-Clause // Tile variant of the PyTorch-style transform benches. 
Each named op registers a tile_operator -// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy<> so the tile policy picker caps +// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy_v so the tile policy picker caps // items/thread at the vector width on sub-4-byte types. Under --enable-tile + // CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this // is the standard CUB path. This file disappears once tile dispatch is fully transparent. @@ -331,20 +331,15 @@ struct tile_operator // MUFU-heavy unary ops: hint the tile policy picker to cap items/thread at the vector width on // sub-4-byte types. template <> -struct tile_mufu_heavy : ::cuda::std::true_type -{}; +inline constexpr bool tile_mufu_heavy_v = true; template <> -struct tile_mufu_heavy : ::cuda::std::true_type -{}; +inline constexpr bool tile_mufu_heavy_v = true; template <> -struct tile_mufu_heavy : ::cuda::std::true_type -{}; +inline constexpr bool tile_mufu_heavy_v = true; template <> -struct tile_mufu_heavy : ::cuda::std::true_type -{}; +inline constexpr bool tile_mufu_heavy_v = true; template <> -struct tile_mufu_heavy : ::cuda::std::true_type -{}; +inline constexpr bool tile_mufu_heavy_v = true; // Binary template diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh index d9db2d4684c..ad4e05926e0 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -16,9 +16,7 @@ // on ct::tile. Omitting it is a clear // static_assert, not a cryptic kernel error. // tile_operator_t -- alias for tile_operator::type. -// tile_mufu_heavy -- specialize to flag Op as MUFU-heavy; the -// tile policy picker uses this hint. -// tile_mufu_heavy_v<...> -- variable-template companion. +// tile_mufu_heavy_v -- specialize to true to flag Op as MUFU-heavy; the tile policy picker uses it. 
// // Eligibility ("may this combo use the tile path?") and substitution ("which // __tile__ functor do we actually run?") are separate traits, so an eligible op @@ -72,14 +70,10 @@ struct tile_operator template using tile_operator_t = typename tile_operator::type; -// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq). Setting this makes -// the tile policy picker cap items/thread so MUFU pipes are not oversaturated. +// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq); specialize to true to make the tile +// policy picker cap items/thread so MUFU pipes are not oversaturated. template -struct tile_mufu_heavy : ::cuda::std::false_type -{}; - -template -inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy::value; +inline constexpr bool tile_mufu_heavy_v = false; } // namespace transform // Internal substitutes shipped by CCCL. From 7428773a71fe2fa2ad4fbd24f8162be76e21f1d3 Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 15 Jun 2026 15:23:42 -0700 Subject: [PATCH 74/83] libcudacxx: undef tile-unsupported builtins instead of stripping _CCCL_API Restore upstream _CCCL_API; undef __builtin_assume_aligned and __builtin_launder under tile. Green on 13.5; 13.4 still needs the _CCCL_TILE strip (exec-check unfixed). --- libcudacxx/include/cuda/std/__cccl/builtin.h | 5 +++++ libcudacxx/include/cuda/std/__cccl/visibility.h | 10 +--------- libcudacxx/include/cuda/std/__new/launder.h | 4 ++++ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index f6cd76cc929..69d1509ebd2 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -112,6 +112,11 @@ # define _CCCL_BUILTIN_ASSUME_ALIGNED(...) 
__builtin_assume_aligned(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned) +#if _CCCL_TILE_COMPILATION() // __builtin_assume_aligned is not supported in tile mode +# undef _CCCL_BUILTIN_ASSUME_ALIGNED +# define _CCCL_BUILTIN_ASSUME_ALIGNED(_Ptr, ...) (_Ptr) +#endif // _CCCL_TILE_COMPILATION() + #if _CCCL_CHECK_BUILTIN(builtin_constant_p) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_constant_p) diff --git a/libcudacxx/include/cuda/std/__cccl/visibility.h b/libcudacxx/include/cuda/std/__cccl/visibility.h index 47337d8d8fd..075a98130aa 100644 --- a/libcudacxx/include/cuda/std/__cccl/visibility.h +++ b/libcudacxx/include/cuda/std/__cccl/visibility.h @@ -116,15 +116,7 @@ # define _CCCL_DEVICE_API _CCCL_DEVICE # define _CCCL_TILE_API _CCCL_TILE #else // ^^^ _CCCL_COMPILER(NVHPC) ^^^ / vvv !_CCCL_COMPILER(NVHPC) vvv -// Local fork patch: drop _CCCL_TILE from _CCCL_API. Under the tile compiler's -// local-only context check, marking a host/device utility __tile__ means its -// body must satisfy tile restrictions even when the caller is non-tile. That -// fails for any utility that takes a user-provided callable (apply, invoke, -// visit, runtime_assume_aligned, ...). Drop the marker globally; tile DSL -// code in this branch uses its own tile-marked operations and doesn't depend -// on libcudacxx utilities being tile-callable. Revert when upstream fixes the -// marking discipline (or the compiler adopts per-instantiation checking). 
-# define _CCCL_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION +# define _CCCL_API _CCCL_TILE _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # define _CCCL_HOST_DEVICE_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # define _CCCL_HOST_API _CCCL_HOST _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # define _CCCL_DEVICE_API _CCCL_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION diff --git a/libcudacxx/include/cuda/std/__new/launder.h b/libcudacxx/include/cuda/std/__new/launder.h index e2f3af192a0..3d67950fc18 100644 --- a/libcudacxx/include/cuda/std/__new/launder.h +++ b/libcudacxx/include/cuda/std/__new/launder.h @@ -32,6 +32,10 @@ # define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC) +#if _CCCL_TILE_COMPILATION() // __builtin_launder is not supported in tile mode +# undef _CCCL_BUILTIN_LAUNDER +#endif // _CCCL_TILE_COMPILATION() + _CCCL_BEGIN_NAMESPACE_CUDA_STD template From 0c13142fb3a1d31db586d2d4b7999b65e7b1d5e3 Mon Sep 17 00:00:00 2001 From: Nan An Date: Mon, 22 Jun 2026 14:38:10 -0700 Subject: [PATCH 75/83] drop aligned_size_t tile-commit hint so all DeviceTransform changes are behind the dispatch macro --- cub/cub/device/device_transform.cuh | 41 ++++++++--------------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index c6059b1cb69..5a8cfdad497 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -98,14 +97,7 @@ struct DeviceTransform // https://github.com/NVIDIA/cccl/issues/8805 for data. 
We use choose_signed_offset to just check if it can hold the // value passed by the user, but otherwise ignore the chosen signed offset type. using offset_t = ::cuda::std::int64_t; - - // num_items may be a plain integer or cuda::aligned_size_t (the cuda::memcpy_async-style opt-in promising N-byte - // pointer alignment + size divisibility). Unwrap to a plain integer for the offset machinery (choose_signed_offset - // needs an integral type); the tile hook below reads the alignment promise. No-op for a plain integer. - constexpr auto num_items_align = ::cuda::__get_size_align_v; - using count_t = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>; - const count_t count = static_cast(num_items); - if (const cudaError_t error = detail::choose_signed_offset::is_exceeding_offset_type(count)) + if (const cudaError_t error = detail::choose_signed_offset::is_exceeding_offset_type(num_items)) { return error; } @@ -113,32 +105,21 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() - // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+ we route to the - // tile kernel: - // - if num_items is a cuda::aligned_size_t=16, the caller has promised 16-byte pointer - // alignment + divisibility, so we commit to tile at compile time and skip the runtime check; - // - otherwise we check the alignment/divisibility preconditions at runtime and fall through to - // the standard CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail - // case, so this is a graceful fallback, not an error). - // device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if the capability - // query fails) we fall through to the standard CUB dispatch. + // Opt-in tile path. 
When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+, we check the + // alignment/divisibility preconditions at runtime and route to the tile kernel; we fall through to the standard + // CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail case, so this is a graceful + // fallback, not an error). device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if + // the capability query fails) we fall through to the standard CUB dispatch. if constexpr (StableAddress == detail::transform::requires_stable_address::no && ::cuda::std::is_same_v && cub::detail::transform::tile:: tile_dispatch_eligible_v) { - if (cub::detail::transform::tile::device_supports_tile()) + if (cub::detail::transform::tile::device_supports_tile() + && cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(num_items))) { - if constexpr (num_items_align >= 16) - { - return cub::detail::transform::tile::dispatch( - inputs, output, static_cast(count), stream); - } - else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(count))) - { - return cub::detail::transform::tile::dispatch( - inputs, output, static_cast(count), stream); - } + return cub::detail::transform::tile::dispatch( + inputs, output, static_cast(num_items), stream); } } #endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() @@ -161,7 +142,7 @@ struct DeviceTransform return detail::transform::dispatch( ::cuda::std::move(inputs), ::cuda::std::move(output), - static_cast(count), + static_cast(num_items), ::cuda::std::move(predicate), ::cuda::std::move(transform_op), stream, From da688b6beb678f1f2d54bd1d966e37e35e465d92 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:17:47 -0700 Subject: [PATCH 76/83] tile: drop internal DeviceTransform struct; fold tile-size pick into free-function dispatch --- .../dispatch/dispatch_transform_tile.cuh | 55 +++++-------------- 1 file changed, 15 
insertions(+), 40 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 48393cc6dcc..0370c258448 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -4,10 +4,8 @@ // Internal dispatch helpers for cub::DeviceTransform's tile path: // tile_dispatch_eligible_v -- compile-time predicate the hook consults // runtime_preconditions_valid -- runtime alignment + divisibility predicate -// dispatch -- bridge that launches the tile kernel with -// the trait's substitute functor -// DeviceTransform -- internal tile-local Transform/Fill wrappers -// used by `dispatch` +// dispatch -- bridge that picks the tile size and launches +// the tile kernel with the trait's substitute functor // User-facing extension points (tile_eligible / tile_mufu_heavy) live in // dispatch_transform_tile_traits.cuh under cub::transform. // Requires CTK 13.4 or newer and nvcc invoked with --enable-tile. @@ -75,37 +73,6 @@ template - [[nodiscard]] static ::cudaError_t Transform( - ::cuda::std::tuple inputs, Out* output, ::cuda::std::int64_t num_items, Fn, ::cudaStream_t stream = nullptr) - { - constexpr int chosen = - (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(MufuHeavy); - return cub::detail::transform::tile::launch_impl( - inputs, output, num_items, stream, ::cuda::std::index_sequence_for{}); - } - - // Fill - template - [[nodiscard]] static ::cudaError_t - Fill(T* output, ::cuda::std::int64_t num_items, T value, ::cudaStream_t stream = nullptr) - { - if (num_items <= 0) - { - return ::cudaSuccess; - } - constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size(); - // One CTA per tile; see launch_impl -- num_blocks can't exceed the unsigned grid x-dim for - // any device-sized num_items. 
- const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen}); - cub::detail::transform::tile::fill_kernel - <<(num_blocks), 1, 0, stream>>>(num_items, output, value); - return CubDebug(::cudaGetLastError()); - } -}; - // Combined compile-time predicate for whether (Op, OutIter, InIters...) can use the tile path. We use this with // `if constexpr` for dispatch: when true the hook tries the tile kernel first and, on runtime alignment/divisibility // failure, falls through to the standard CUB dispatch; when false the tile branch is discarded entirely. @@ -154,21 +121,29 @@ template inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) { - auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); - auto in_ptrs = ::cuda::std::apply( + const auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + const auto in_ptrs = ::cuda::std::apply( [](auto... iters) { return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...); }, inputs); - // The tile functor to run for TransformOp: its registered tile_operator mirror. + + // The tile functor to run for TransformOp: its registered tile_operator mirror (a scalar functor can't be + // invoked on ct::tile). using tile_op_t = cub::transform::tile_operator_t; static_assert(::cuda::std::is_empty_v, "tile_operator type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, "tile_operator type must be trivially default constructible"); - return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v, tile_op_t>( - in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream); + // Pick the tile size from the element types (no caller override -- mirrors the regular path, where the policy + // drives the size), then launch. 
+ constexpr int tile_size = + cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t, + ::cuda::std::iter_value_t...>( + cub::transform::tile_mufu_heavy_v); + return cub::detail::transform::tile::launch_impl( + in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), stream, ::cuda::std::index_sequence_for{}); } } // namespace detail::transform::tile From ebede8815781d203dcec7eb0a2cff494526ba99c Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:20:22 -0700 Subject: [PATCH 77/83] tile: remove unused fill_kernel (no Fill hook wired) --- .../dispatch/kernels/kernel_transform_tile.cuh | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index bd36bd1843d..b9627748cb3 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -43,7 +43,7 @@ template return ct::partition_view{span, ct::shape{}}; } -// Tile DSL kernels backing cub::DeviceTransform's tile path. They assume 16-byte pointer alignment + 16-divisible +// Tile DSL kernel backing cub::DeviceTransform's tile path. It assumes 16-byte pointer alignment + 16-divisible // num_items (so the compiler picks LDG.E.128); the dispatch header honors that. NV_IF_TARGET(NV_PROVIDES_SM_80) // guards the body -- tile needs sm_80+, so sub-80 arches get a no-op kernel (dispatch only launches it on sm_80+). // assume_divisible<16> -- num_items % 16 == 0, so the tile DSL can elide tail handling. 
@@ -62,18 +62,6 @@ transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, co const auto out_view = make_aligned_partition_view(out, n); out_view.store_masked(Fn{}(make_aligned_partition_view(ins, n).load_masked(bx)...), bx);)); } - -template -__tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value) -{ - namespace ct = ::cuda::tiles; - NV_IF_TARGET( - NV_PROVIDES_SM_80, - (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); - const auto out_view = make_aligned_partition_view(out, n); - using tile_t = ct::tile>; - out_view.store_masked(ct::full(value), bx);)); -} } // namespace detail::transform::tile CUB_NAMESPACE_END From cd8d429a93cfdce838ed0d8fc21ea1bf1112a362 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:26:30 -0700 Subject: [PATCH 78/83] tile: drop redundant inline comments in dispatch --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 0370c258448..8a44d0f9e9f 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -128,16 +128,12 @@ dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_item }, inputs); - // The tile functor to run for TransformOp: its registered tile_operator mirror (a scalar functor can't be - // invoked on ct::tile). 
using tile_op_t = cub::transform::tile_operator_t; static_assert(::cuda::std::is_empty_v, "tile_operator type must be stateless (the tile kernel default-constructs it)"); static_assert(::cuda::std::is_trivially_default_constructible_v, "tile_operator type must be trivially default constructible"); - // Pick the tile size from the element types (no caller override -- mirrors the regular path, where the policy - // drives the size), then launch. constexpr int tile_size = cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t, ::cuda::std::iter_value_t...>( From 8a8724bfc18a57cce076bf45b398a0e463c0752e Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:26:30 -0700 Subject: [PATCH 79/83] tile: use Apache-2.0 license header for new bench files --- cub/benchmarks/bench/transform/tile/babelstream.cu | 2 +- cub/benchmarks/bench/transform/tile/copy.cu | 2 +- cub/benchmarks/bench/transform/tile/grayscale.cu | 2 +- cub/benchmarks/bench/transform/tile/pytorch.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu index 45260565b2a..412d5957da3 100644 --- a/cub/benchmarks/bench/transform/tile/babelstream.cu +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by // named, stateless ops that register a tile_operator substitute (gated). 
Under --enable-tile + diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu index e6b869f15e0..a0c32e3d16d 100644 --- a/cub/benchmarks/bench/transform/tile/copy.cu +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // Pure copy (identity transform) -- measures plain load/store bandwidth through the tile // load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu index f7e2a581ae0..fbc539fa31c 100644 --- a/cub/benchmarks/bench/transform/tile/grayscale.cu +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // Tile variant of the grayscale transform bench. Unlike the base bench (a single rgb_t struct // input), this uses three separate R/G/B streams so the inputs are plain element types the tile path diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu index a63a89f68da..deff05e2852 100644 --- a/cub/benchmarks/bench/transform/tile/pytorch.cu +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3-Clause +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // Tile variant of the PyTorch-style transform benches. 
Each named op registers a tile_operator // substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy_v so the tile policy picker caps From 20fd23705687ea3204d84b21e2c937853387590d Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:28:35 -0700 Subject: [PATCH 80/83] tile: clang-format dispatch and fix stale struct reference in comment --- .../device/dispatch/dispatch_transform_tile.cuh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 8a44d0f9e9f..6803aa66541 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -114,7 +114,7 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte return aligned_out && aligned_in && (num_items % items_divisor) == 0; } -// Bridge from cub::DeviceTransform::__transform_internal to the tile DeviceTransform. Precondition (the caller +// Bridge from cub::DeviceTransform::__transform_internal to the tile kernel. Precondition (the caller // checks it): tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. Launches the kernel // with tile_operator_t -- Op's registered __tile__ mirror (a scalar functor can't be invoked on ct::tile). 
template @@ -134,12 +134,15 @@ dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_item static_assert(::cuda::std::is_trivially_default_constructible_v, "tile_operator type must be trivially default constructible"); - constexpr int tile_size = - cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t, - ::cuda::std::iter_value_t...>( - cub::transform::tile_mufu_heavy_v); + constexpr int tile_size = cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t, + ::cuda::std::iter_value_t...>( + cub::transform::tile_mufu_heavy_v); return cub::detail::transform::tile::launch_impl( - in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), stream, ::cuda::std::index_sequence_for{}); + in_ptrs, + out_ptr, + static_cast<::cuda::std::int64_t>(num_items), + stream, + ::cuda::std::index_sequence_for{}); } } // namespace detail::transform::tile From 581fcf4e364e22d82abc3c8f164d719d3b142594 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:43:53 -0700 Subject: [PATCH 81/83] tile: mark out_ptr const in runtime_preconditions_valid --- cub/cub/device/dispatch/dispatch_transform_tile.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh index 6803aa66541..c319f429682 100644 --- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -101,7 +101,7 @@ runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIte constexpr int byte_align = 16; constexpr int items_divisor = 16; - auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + const auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); const bool aligned_out = ::cuda::std::is_sufficiently_aligned(out_ptr); const bool aligned_in = ::cuda::std::apply( [](auto... 
iters) { From 3ce1647aaf13c870aafe6aad0af2edd3594515e0 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:43:53 -0700 Subject: [PATCH 82/83] tile: drop __restrict__ from kernel params (API permits in-place transforms) --- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index b9627748cb3..d93dbd5bd2a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -53,7 +53,7 @@ template // miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR). template __tile_global__ void -transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins) +transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins) { namespace ct = ::cuda::tiles; NV_IF_TARGET( From f7729e726b21400e55fa953f4c36231e5fb992a8 Mon Sep 17 00:00:00 2001 From: Nan An Date: Tue, 23 Jun 2026 13:51:08 -0700 Subject: [PATCH 83/83] tile: reflow transform_kernel signature (clang-format) --- cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh index d93dbd5bd2a..b2e8f2b5e68 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -52,8 +52,7 @@ template // NOTE: make_aligned_partition_view is invoked directly -- do NOT wrap these calls in a lambda: nvcc 13.4 // miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR). 
template -__tile_global__ void -transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins) +__tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins) { namespace ct = ::cuda::tiles; NV_IF_TARGET(