diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt index 4c8c778f7fe..820614d1e65 100644 --- a/cub/CMakeLists.txt +++ b/cub/CMakeLists.txt @@ -10,6 +10,34 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) +# Build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. Defaults ON when the +# toolkit can build it (CTK 13.4+), so 13.4+ configs -- including CI -- exercise the tile path automatically; OFF +# and compiled out below 13.4. The sm_80+ floor is enforced at runtime (dispatch cc check + NV_IF_TARGET in the +# kernels), so a 13.4+ build still runs correctly on any GPU. +set(_cccl_tile_transform_default OFF) +if ( + "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" + AND NOT "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4 +) + set(_cccl_tile_transform_default ON) +endif() +option( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)." + ${_cccl_tile_transform_default} +) +if ( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + AND "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" + AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4 +) + message( + FATAL_ERROR + "CCCL_ENABLE_TILE_TRANSFORM_DISPATCH requires CUDA 13.4+ (nvcc --enable-tile). " + "Found ${CMAKE_CUDA_COMPILER_VERSION}." + ) +endif() + option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") set(CUB_ENABLE_TUNING OFF) diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt index e54cf1c80db..c9e9e7893cd 100644 --- a/cub/benchmarks/CMakeLists.txt +++ b/cub/benchmarks/CMakeLists.txt @@ -91,6 +91,22 @@ function(add_bench target_name bench_name bench_src) cccl.nvbench_helper nvbench::main ) + + # Tile-DSL transform benches: build under --enable-tile + the dispatch opt-in when requested. Gated by + # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI builds the tile/ benches with the tile path off. + if ( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + AND "${bench_src}" MATCHES "/transform/tile/" + ) + target_compile_options( + ${bench_target} + PRIVATE "$<$:--enable-tile>" + ) + target_compile_definitions( + ${bench_target} + PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + ) + endif() endfunction() function(add_bench_dir bench_dir) diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu new file mode 100644 index 00000000000..412d5957da3 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/babelstream.cu @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by +// named, stateless ops that register a tile_operator substitute (gated). Under --enable-tile + +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this +// is the standard CUB transform path. This file disappears once tile dispatch is fully transparent. + +#include "../common.h" + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include +#endif + +// Stateless scalar ops, used at the call site in both build modes. Constants are baked in so the ops +// stay stateless (the tile substitute must be trivially default constructible): with startScalar == -2, +// `c * scalar` is `-(c + c)`, `b + scalar * c` is `b - c - c`, etc. +struct mul_op +{ + template + __host__ __device__ auto operator()(B b) const + { + return -(b + b); + } +}; +struct add_op +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a + b; + } +}; +struct triad_op +{ + template + __host__ __device__ auto operator()(B b, C c) const + { + return b - c - c; + } +}; +struct nstream_op +{ + template + __host__ __device__ auto operator()(A a, B b, C c) const + { + return a + b - c - c; + } +}; + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +struct tile_mul_op +{ + template + __tile__ auto operator()(B b) const + { + return -(b + b); + } +}; +struct tile_add_op +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; +struct tile_triad_op +{ + template + __tile__ auto operator()(B b, C c) const + { + return b - c - c; + } +}; +struct tile_nstream_op +{ + template + __tile__ auto operator()(A a, B b, C c) const + { + return a + b - c - c; + } +}; + +CUB_NAMESPACE_BEGIN +namespace transform +{ +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_mul_op; +}; +template <> +struct tile_operator +{ + using type = tile_add_op; +}; +template <> +struct tile_operator +{ + using type = tile_triad_op; +}; +template <> +struct tile_operator +{ + using type = tile_nstream_op; +}; +} // namespace transform +CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +// The tile path does not support __int128 (no tensor_span/partition_view for it), so the type axis +// omits it relative to the base babelstream bench. +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = nvbench::type_list; +#endif + +inline auto array_size_powers = nvbench::range(16, 32, 4); + +// Same constant inputs as the base bench so nstream maintains a consistent workload. +inline constexpr auto startA = 11; +inline constexpr auto startB = 2; +inline constexpr auto startC = 1; +inline constexpr auto startScalar = -2; +static_assert(startA == (startA + startB + startScalar * startC), "nstream must have a consistent workload"); + +template +static void mul(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{c.begin()}, b.begin(), n, mul_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_mul") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + +template +static void add(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, add_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_add") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + +template +static void triad(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, triad_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_triad") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + +template +static void nstream(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), n, nstream_op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_nstream") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu new file mode 100644 index 00000000000..a0c32e3d16d --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/copy.cu @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Pure copy (identity transform) -- measures plain load/store bandwidth through the tile +// load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under +// --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes it to the tile kernel, +// otherwise it falls through to CUB's standard transform. This file disappears once tile dispatch is +// fully transparent. + +#include "../common.h" + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include +#endif + +struct identity +{ + template + __host__ __device__ auto operator()(T v) const + { + return v; + } +}; + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +struct tile_identity +{ + template + __tile__ auto operator()(T v) const + { + return v; + } +}; + +CUB_NAMESPACE_BEGIN +namespace transform +{ +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_identity; +}; +} // namespace transform +CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = nvbench::type_list; +#endif + +template +static void copy(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + + thrust::device_vector in = generate(n); + thrust::device_vector out(n, thrust::no_init); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, identity{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(element_types)) + .set_name("tile_copy") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4)); diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu new file mode 100644 index 00000000000..fbc539fa31c --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/grayscale.cu @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Tile variant of the grayscale transform bench. Unlike the base bench (a single rgb_t struct +// input), this uses three separate R/G/B streams so the inputs are plain element types the tile path +// can vectorize. The named rgb_to_y op registers a tile_operator substitute (gated). This file +// disappears once tile dispatch is fully transparent. + +#include "../common.h" + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include +#endif + +struct rgb_to_y +{ + template + __host__ __device__ auto operator()(R r, G g, B b) const + { + constexpr float w_r = 0.2989f; + constexpr float w_g = 0.587f; + constexpr float w_b = 0.114f; + return w_r * r + w_g * g + w_b * b; + } +}; + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +struct tile_rgb_to_y +{ + template + __tile__ auto operator()(R r, G g, B b) const + { + constexpr float w_r = 0.2989f; + constexpr float w_g = 0.587f; + constexpr float w_b = 0.114f; + return w_r * r + w_g * g + w_b * b; + } +}; + +CUB_NAMESPACE_BEGIN +namespace transform +{ +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_rgb_to_y; +}; +} // namespace transform +CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +#ifdef TUNE_T +using value_types = nvbench::type_list; +#else +using value_types = nvbench::type_list; +#endif + +template +static void grayscale(nvbench::state& state, nvbench::type_list) +try +{ + const auto n = state.get_int64("Elements{io}"); + + thrust::device_vector r = generate(n); + thrust::device_vector g = generate(n); + thrust::device_vector b = generate(n); + thrust::device_vector out(n, thrust::no_init); + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); // matches the base bench's rgb_t = 3 * sizeof(T) + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{r.begin(), g.begin(), b.begin()}, out.begin(), n, rgb_to_y{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types)) + .set_name("tile_grayscale") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4)); diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu new file mode 100644 index 00000000000..deff05e2852 --- /dev/null +++ b/cub/benchmarks/bench/transform/tile/pytorch.cu @@ -0,0 +1,493 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Tile variant of the PyTorch-style transform benches. Each named op registers a tile_operator +// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy_v so the tile policy picker caps +// items/thread at the vector width on sub-4-byte types. Under --enable-tile + +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this +// is the standard CUB path. This file disappears once tile dispatch is fully transparent. + +#include +#include + +#include + +#include "../common.h" + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include +#endif + +// Scalar ops the user passes to Transform. Sub-4-byte input types compute in float and cast back, +// matching the tile substitutes below. +template +__host__ __device__ float to_f(T v) +{ + return static_cast(v); +} +template +__host__ __device__ T from_f(float f) +{ + return static_cast(f); +} + +struct relu_op +{ + template + __host__ __device__ T operator()(T v) const + { + float f = to_f(v); + return from_f(f > 0.0f ? f : 0.0f); + } +}; +struct sigmoid_op +{ + template + __host__ __device__ T operator()(T v) const + { + float f = to_f(v); + return from_f(1.0f / (1.0f + ::cuda::std::exp(-f))); + } +}; +struct tanh_op +{ + template + __host__ __device__ T operator()(T v) const + { + return from_f(::cuda::std::tanh(to_f(v))); + } +}; +struct gelu_op +{ + template + __host__ __device__ T operator()(T v) const + { + constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; + float f = to_f(v); + return from_f(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f)))); + } +}; +struct sin_op +{ + template + __host__ __device__ T operator()(T v) const + { + return from_f(::cuda::std::sin(to_f(v))); + } +}; +struct exp_op +{ + template + __host__ __device__ T operator()(T v) const + { + return from_f(::cuda::std::exp(to_f(v))); + } +}; + +struct binary_add +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a + b; + } +}; +struct binary_sub +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a - b; + } +}; +struct binary_mul +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a * b; + } +}; +struct binary_div +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a / b; + } +}; +struct binary_le +{ + template + __host__ __device__ A operator()(A a, B b) const + { + return static_cast(a <= b); + } +}; +struct binary_ge +{ + template + __host__ __device__ A operator()(A a, B b) const + { + return static_cast(a >= b); + } +}; +struct binary_fmin +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a < b ? a : b; + } +}; +struct binary_fmax +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return a > b ? a : b; + } +}; + +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +namespace ct = ::cuda::tiles; + +template +__tile__ auto as_float(T v) +{ + return ct::element_cast(v); +} +template +__tile__ auto from_float(F f) +{ + return ct::element_cast>(f); +} + +struct tile_relu +{ + template + __tile__ auto operator()(T v) const + { + auto f = as_float(v); + return from_float(ct::select(f > 0.0f, f, f - f)); + } +}; +struct tile_sigmoid +{ + template + __tile__ auto operator()(T v) const + { + auto f = as_float(v); + return from_float(1.0f / (1.0f + ct::exp(-f))); + } +}; +struct tile_tanh +{ + template + __tile__ auto operator()(T v) const + { + return from_float(ct::tanh(as_float(v))); + } +}; +struct tile_gelu +{ + template + __tile__ auto operator()(T v) const + { + constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f; + auto f = as_float(v); + return from_float(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); + } +}; +struct tile_sin +{ + template + __tile__ auto operator()(T v) const + { + return from_float(ct::sin(as_float(v))); + } +}; +struct tile_exp +{ + template + __tile__ auto operator()(T v) const + { + return from_float(ct::exp(as_float(v))); + } +}; + +struct tile_binary_add +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; +struct tile_binary_sub +{ + template + __tile__ auto operator()(A a, B b) const + { + return a - b; + } +}; +struct tile_binary_mul +{ + template + __tile__ auto operator()(A a, B b) const + { + return a * b; + } +}; +struct tile_binary_div +{ + template + __tile__ auto operator()(A a, B b) const + { + return a / b; + } +}; +struct tile_binary_le +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::element_cast>(a <= b); + } +}; +struct tile_binary_ge +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::element_cast>(a >= b); + } +}; +struct tile_binary_fmin +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::select(a < b, a, b); + } +}; +struct tile_binary_fmax +{ + template + __tile__ auto operator()(A a, B b) const + { + return ct::select(a > b, a, b); + } +}; + +CUB_NAMESPACE_BEGIN +namespace transform +{ +// Unary +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_relu; +}; +template <> +struct tile_operator +{ + using type = tile_sigmoid; +}; +template <> +struct tile_operator +{ + using type = tile_tanh; +}; +template <> +struct tile_operator +{ + using type = tile_gelu; +}; +template <> +struct tile_operator +{ + using type = tile_sin; +}; +template <> +struct tile_operator +{ + using type = tile_exp; +}; + +// MUFU-heavy unary ops: hint the tile policy picker to cap items/thread at the vector width on +// sub-4-byte types. +template <> +inline constexpr bool tile_mufu_heavy_v = true; +template <> +inline constexpr bool tile_mufu_heavy_v = true; +template <> +inline constexpr bool tile_mufu_heavy_v = true; +template <> +inline constexpr bool tile_mufu_heavy_v = true; +template <> +inline constexpr bool tile_mufu_heavy_v = true; + +// Binary +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_binary_add; +}; +template <> +struct tile_operator +{ + using type = tile_binary_sub; +}; +template <> +struct tile_operator +{ + using type = tile_binary_mul; +}; +template <> +struct tile_operator +{ + using type = tile_binary_div; +}; +template <> +struct tile_operator +{ + using type = tile_binary_le; +}; +template <> +struct tile_operator +{ + using type = tile_binary_ge; +}; +template <> +struct tile_operator +{ + using type = tile_binary_fmin; +}; +template <> +struct tile_operator +{ + using type = tile_binary_fmax; +}; +} // namespace transform +CUB_NAMESPACE_END +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = nvbench::type_list< +# if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2) + __half, +# endif +# if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2) + __nv_bfloat16, +# endif + nvbench::float32_t>; +#endif + +template +static void run_unary(nvbench::state& state) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector in(n, T(1)); + thrust::device_vector out(n, thrust::no_init); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, Op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +template +static void run_binary(nvbench::state& state) +try +{ + const auto n = state.get_int64("Elements{io}"); + thrust::device_vector a(n, T(1)); + thrust::device_vector b(n, T(1)); + thrust::device_vector out(n, thrust::no_init); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, out.begin(), n, Op{}); +} +catch (const std::bad_alloc&) +{ + state.skip("Skipping: out of memory."); +} + +inline auto pt_sizes = nvbench::range(16, 32, 4); + +#define UNARY_BENCH(name, op) \ + template \ + static void name##_bench(nvbench::state& state, nvbench::type_list) \ + { \ + run_unary(state); \ + } \ + NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)) \ + .set_name("tile_" #name) \ + .set_type_axes_names({"T{ct}"}) \ + .add_int64_power_of_two_axis("Elements{io}", pt_sizes) + +UNARY_BENCH(relu, relu_op); +UNARY_BENCH(sigmoid, sigmoid_op); +UNARY_BENCH(tanh, tanh_op); +UNARY_BENCH(gelu, gelu_op); +UNARY_BENCH(sin, sin_op); +UNARY_BENCH(exp, exp_op); + +#define BINARY_BENCH(name, op) \ + template \ + static void name##_bench(nvbench::state& state, nvbench::type_list) \ + { \ + run_binary(state); \ + } \ + NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)) \ + .set_name("tile_pt_" #name) \ + .set_type_axes_names({"T{ct}"}) \ + .add_int64_power_of_two_axis("Elements{io}", pt_sizes) + +BINARY_BENCH(add, binary_add); +BINARY_BENCH(sub, binary_sub); +BINARY_BENCH(mul, binary_mul); +BINARY_BENCH(div, binary_div); +BINARY_BENCH(le, binary_le); +BINARY_BENCH(ge, binary_ge); +BINARY_BENCH(fmin, binary_fmin); +BINARY_BENCH(fmax, binary_fmax); diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index d8ad0354bfc..5a8cfdad497 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -15,8 +15,13 @@ #include #include +#include #include +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include +#endif + #include #include #include @@ -99,6 +104,26 @@ struct DeviceTransform const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get(); +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+, we check the + // alignment/divisibility preconditions at runtime and route to the tile kernel; we fall through to the standard + // CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail case, so this is a graceful + // fallback, not an error). device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if + // the capability query fails) we fall through to the standard CUB dispatch. + if constexpr (StableAddress == detail::transform::requires_stable_address::no + && ::cuda::std::is_same_v + && cub::detail::transform::tile:: + tile_dispatch_eligible_v) + { + if (cub::detail::transform::tile::device_supports_tile() + && cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast(num_items))) + { + return cub::detail::transform::tile::dispatch( + inputs, output, static_cast(num_items), stream); + } + } +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + using tuning_env = ::cuda::std::execution::__query_result_or_t>; using default_policy_selector = diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh new file mode 100644 index 00000000000..c319f429682 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Internal dispatch helpers for cub::DeviceTransform's tile path: +// tile_dispatch_eligible_v -- compile-time predicate the hook consults +// runtime_preconditions_valid -- runtime alignment + divisibility predicate +// dispatch -- bridge that picks the tile size and launches +// the tile kernel with the trait's substitute functor +// User-facing extension points (tile_eligible / tile_mufu_heavy) live in +// dispatch_transform_tile_traits.cuh under cub::transform. +// Requires CTK 13.4 or newer and nvcc invoked with --enable-tile. + +#pragma once + +#include + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUB_HAS_TILE_TRANSFORM() + +# include +# include +# include +# include +# include + +# include +# include + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile +{ +template +[[nodiscard]] ::cudaError_t launch_impl( + ::cuda::std::tuple inputs, + Out* output, + ::cuda::std::int64_t num_items, + ::cudaStream_t stream, + ::cuda::std::index_sequence) +{ + if (num_items <= 0) + { + return ::cudaSuccess; + } + + // One CTA per tile. The cast to the unsigned grid x-dim can't truncate: num_blocks > 2^32-1 + // would need num_items > TileSize * 2^32 (>= 2^40 elements), more than any device can hold. + const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{TileSize}); + + cub::detail::transform::tile::transform_kernel + <<(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get(inputs)...); + + return CubDebug(::cudaGetLastError()); +} + +// Combined compile-time predicate for whether (Op, OutIter, InIters...) can use the tile path. We use this with +// `if constexpr` for dispatch: when true the hook tries the tile kernel first and, on runtime alignment/divisibility +// failure, falls through to the standard CUB dispatch; when false the tile branch is discarded entirely. +template +inline constexpr bool tile_dispatch_eligible_v = + THRUST_NS_QUALIFIER::is_contiguous_iterator_v + && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...) + && cub::transform::tile_eligible_v, sizeof...(InIters)>; + +// Runtime arch gate: tile needs sm_80+. False (fall back to CUB) below sm_80 or if the cc query fails. +[[nodiscard]] CUB_RUNTIME_FUNCTION inline bool device_supports_tile() +{ + ::cuda::compute_capability cc{}; + return cub::detail::ptx_compute_cap(cc) == ::cudaSuccess && cc >= ::cuda::compute_capability{8, 0}; +} + +// Runtime precondition the tile hook checks before dispatching: 16-byte pointer alignment + num_items % 16 == 0 +// (the kernels assume_aligned<16>/assume_divisible<16>, so violating these is UB). False -> fall back to CUB. +template +[[nodiscard]] CUB_RUNTIME_FUNCTION bool +runtime_preconditions_valid(::cuda::std::tuple const& inputs, OutIter output, OffsetT num_items) +{ + // Pointer alignment is in bytes (for LDG.E.128); the kernel's + // ct::assume_divisible applies to num_items as an element count. These + // are both 16 today by coincidence but live on different axes. + constexpr int byte_align = 16; + constexpr int items_divisor = 16; + + const auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + const bool aligned_out = ::cuda::std::is_sufficiently_aligned(out_ptr); + const bool aligned_in = ::cuda::std::apply( + [](auto... iters) { + return ( + (::cuda::std::is_sufficiently_aligned(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters))) + && ...); + }, + inputs); + + return aligned_out && aligned_in && (num_items % items_divisor) == 0; +} + +// Bridge from cub::DeviceTransform::__transform_internal to the tile kernel. Precondition (the caller +// checks it): tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. Launches the kernel +// with tile_operator_t -- Op's registered __tile__ mirror (a scalar functor can't be invoked on ct::tile). +template +[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t +dispatch(::cuda::std::tuple inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream) +{ + const auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output); + const auto in_ptrs = ::cuda::std::apply( + [](auto... iters) { + return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...); + }, + inputs); + + using tile_op_t = cub::transform::tile_operator_t; + static_assert(::cuda::std::is_empty_v, + "tile_operator type must be stateless (the tile kernel default-constructs it)"); + static_assert(::cuda::std::is_trivially_default_constructible_v, + "tile_operator type must be trivially default constructible"); + + constexpr int tile_size = cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t, + ::cuda::std::iter_value_t...>( + cub::transform::tile_mufu_heavy_v); + return cub::detail::transform::tile::launch_impl( + in_ptrs, + out_ptr, + static_cast<::cuda::std::int64_t>(num_items), + stream, + ::cuda::std::index_sequence_for{}); +} +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh new file mode 100644 index 00000000000..0c06b091335 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Single source of truth for the compile-time gates the tile transform headers +// share. Two macros: +// +// _CCCL_CUB_HAS_TILE_TRANSFORM() +// True when nvcc is in tile mode (--enable-tile / _CCCL_TILE_COMPILATION()) AND CTK 13.4+. The sm_80+ +// requirement is handled at runtime + NV_IF_TARGET in the kernels, not here. When false, the tile headers +// (kernel / tuning / dispatch / traits) are skipped entirely. +// +// _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +// True when the dispatch hook in cub::DeviceTransform should fire. Same as +// _CCCL_CUB_HAS_TILE_TRANSFORM() plus the user opt-in macro +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH. + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#define _CCCL_CUB_HAS_TILE_TRANSFORM() (_CCCL_TILE_COMPILATION() && _CCCL_CTK_AT_LEAST(13, 4)) + +#if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) +# define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1 +#else +# define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 0 +#endif diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh new file mode 100644 index 00000000000..ad4e05926e0 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Compile-time policy for cub::DeviceTransform's tile path. +// +// PUBLIC EXTENSION POINTS (cub::transform) -- two independent axes: +// tile_eligible_v -- specialize to true to opt a (functor type, +// element type, input arity) combo into the +// tile dispatch path. Eligibility only. +// tile_operator -- the __tile__ functor the tile kernel runs +// for Op. No default: every tile-eligible Op +// must specialize it with `using type = `, +// because a scalar functor (e.g. +// cuda::std::plus<__half>) cannot be invoked +// on ct::tile. Omitting it is a clear +// static_assert, not a cryptic kernel error. +// tile_operator_t -- alias for tile_operator::type. +// tile_mufu_heavy_v -- specialize to true to flag Op as MUFU-heavy; the tile policy picker uses it. +// +// Eligibility ("may this combo use the tile path?") and substitution ("which +// __tile__ functor do we actually run?") are separate traits, so an eligible op +// always registers both: tile_eligible_v and tile_operator. +// +// INTERNAL (cub::detail::transform::tile): +// tile_plus, tile_multiplies -- shipped tile-friendly substitutes used by +// the built-in specializations below. + +#pragma once + +#include + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUB_HAS_TILE_TRANSFORM() + +# include +# include +# include + +# include + +CUB_NAMESPACE_BEGIN + +// Public extension surface. +namespace transform +{ +// Opt a (functor type, element type, input arity) combo into the tile dispatch path: specialize this to +// true for the combo. Eligibility only -- the __tile__ functor to actually run is named by tile_operator. +template +inline constexpr bool tile_eligible_v = false; + +// The __tile__ functor the tile kernel runs for Op. +template +struct tile_operator +{ + static_assert(sizeof(Op) == 0, + "cub::transform::tile_operator must be specialized for every tile-eligible Op: " + "provide `using type = `."); +}; + +template +using tile_operator_t = typename tile_operator::type; + +// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq); specialize to true to make the tile +// policy picker cap items/thread so MUFU pipes are not oversaturated. +template +inline constexpr bool tile_mufu_heavy_v = false; +} // namespace transform + +// Internal substitutes shipped by CCCL. +namespace detail::transform::tile +{ +// Tile-friendly mirrors of common cuda::std ops. Each has a __tile__ +// templated operator() so it can be invoked from inside transform_kernel +// where the arguments are ct::tile rather than scalar T. +struct tile_plus +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; + +struct tile_multiplies +{ + template + __tile__ auto operator()(A a, B b) const + { + return a * b; + } +}; +} // namespace detail::transform::tile + +// Built-in trait specializations live in the public namespace alongside the +// trait, but reference the internal substitute functors. +namespace transform +{ +// cuda::std::plus / multiplies are scalar ops, so each is marked eligible and given a tile_operator mirror. +# if _CCCL_HAS_NVFP16() +template <> +inline constexpr bool tile_eligible_v<::cuda::std::plus<::__half>, ::__half, 2> = true; +template <> +inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__half>, ::__half, 2> = true; +template <> +struct tile_operator<::cuda::std::plus<::__half>> +{ + using type = cub::detail::transform::tile::tile_plus; +}; +template <> +struct tile_operator<::cuda::std::multiplies<::__half>> +{ + using type = cub::detail::transform::tile::tile_multiplies; +}; +# endif // _CCCL_HAS_NVFP16() + +# if _CCCL_HAS_NVBF16() +template <> +inline constexpr bool tile_eligible_v<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true; +template <> +inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true; +template <> +struct tile_operator<::cuda::std::plus<::__nv_bfloat16>> +{ + using type = cub::detail::transform::tile::tile_plus; +}; +template <> +struct tile_operator<::cuda::std::multiplies<::__nv_bfloat16>> +{ + using type = cub::detail::transform::tile::tile_multiplies; +}; +# endif // _CCCL_HAS_NVBF16() +} // namespace transform + +CUB_NAMESPACE_END + +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh new file mode 100644 index 00000000000..b2e8f2b5e68 --- /dev/null +++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUB_HAS_TILE_TRANSFORM() + +# include + +# include + +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile +{ +// Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing: +// assume_aligned<16> -- promises the pointer is 16-byte aligned, so the compiler can pick LDG.E.128 vectorized +// loads/stores. +// ct::extents -- explicit element type on the extent; CTAD would deduce uint32_t and wrap at 2^32. +// int64_t lets us cover the full num_items range. +// The caller is responsible for honoring assume_aligned<16>; the dispatch header's runtime_preconditions_valid +// enforces this before launching either kernel. +template +[[nodiscard]] __tile__ auto make_aligned_partition_view(T* ptr, N n) +{ + namespace ct = ::cuda::tiles; + const auto ptr_align = ct::assume_aligned<16>(ptr); + auto span = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}}; + return ct::partition_view{span, ct::shape{}}; +} + +// Tile DSL kernel backing cub::DeviceTransform's tile path. It assumes 16-byte pointer alignment + 16-divisible +// num_items (so the compiler picks LDG.E.128); the dispatch header honors that. NV_IF_TARGET(NV_PROVIDES_SM_80) +// guards the body -- tile needs sm_80+, so sub-80 arches get a no-op kernel (dispatch only launches it on sm_80+). +// assume_divisible<16> -- num_items % 16 == 0, so the tile DSL can elide tail handling. +// assume_bounded_below<0> -- num_items >= 0; enables sign-comparison simplifications. +// +// NOTE: make_aligned_partition_view is invoked directly -- do NOT wrap these calls in a lambda: nvcc 13.4 +// miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR). +template +__tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins) +{ + namespace ct = ::cuda::tiles; + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items)); + const auto out_view = make_aligned_partition_view(out, n); + out_view.store_masked(Fn{}(make_aligned_partition_view(ins, n).load_masked(bx)...), bx);)); +} +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh new file mode 100644 index 00000000000..8a11ad60f7a --- /dev/null +++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Policy picker for cub::DeviceTransform's tile path. Shares the +// bytes-in-flight target used by CUB's non-tile algorithms (calls +// tuning_transform.cuh's cc_to_min_bytes_in_flight) but expresses the +// answer as a TileSize, since tile kernels partition by compile-time +// shape rather than threads*items. + +#pragma once + +#include + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUB_HAS_TILE_TRANSFORM() + +# include + +# include +# include +# include +# include +# include +# include + +CUB_NAMESPACE_BEGIN + +namespace detail::transform::tile +{ +// mufu_heavy=true tells the policy the functor body has heavy MUFU usage. +// for small data types, vectorized load will make them arrive packed in +// registers and the compiler unpacks them and packs them back. reducing the +// compute work per thread helps here. need profiling to know the exact cause. +template +constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability cc = {10, 0}) +{ + constexpr int threads_per_block = 128; + constexpr int vector_bytes = 16; // LDG.E.128 -> 16 bytes + constexpr int max_items_per_thread = 32; + constexpr int max_occupancy = 16; + + constexpr auto min_elem = ::cuda::std::min({sizeof(Out), sizeof(Ins)...}); + constexpr int items_for_vec = ::cuda::ceil_div(vector_bytes, min_elem); + + // Fill (zero inputs) keeps the same latency target by counting output bytes. + constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out); + const int target = cub::detail::transform::cc_to_min_bytes_in_flight(cc); + const int items_for_latency = + static_cast(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter)); + + int items = ::cuda::std::max(items_for_vec, items_for_latency); + items = static_cast(::cuda::next_power_of_two(static_cast(items))); + if (items > max_items_per_thread) + { + items = max_items_per_thread; + } + + if (mufu_heavy && min_elem < 4) + { + // Elements that fit in one 16-byte vector load -> items/thread cap for MUFU-heavy sub-4B ops. + // min_elem is size_t, so cast the quotient once here to keep this an int item count (matches + // items below, so the comparison/assignment stay int-vs-int: no sign-compare, no use-site casts). + constexpr int vec_items_cap = static_cast(vector_bytes / min_elem); // 16 for I8, 8 for I16/half/bf16 + if (items > vec_items_cap) + { + items = vec_items_cap; + } + } + + return items * threads_per_block; +} +} // namespace detail::transform::tile + +CUB_NAMESPACE_END + +#endif // _CCCL_CUB_HAS_TILE_TRANSFORM() diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index ce46a86b93e..0214861c053 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -172,6 +172,22 @@ function( target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() + # Tile-DSL transform test: compile under --enable-tile and turn on the dispatch hook. Gated by + # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI keeps the tile path compiled out. + if ( + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + AND "${test_src}" MATCHES "test_device_transform_tile\\.cu$" + ) + target_compile_options( + ${test_target} + PRIVATE $<$:--enable-tile> + ) + target_compile_definitions( + ${test_target} + PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH + ) + endif() + # enable lambdas for all API examples if ("${test_src}" MATCHES "test.+_api\\.cu$") target_compile_options( diff --git a/cub/test/catch2_test_device_transform_tile.cu b/cub/test/catch2_test_device_transform_tile.cu new file mode 100644 index 00000000000..f77eea0e31b --- /dev/null +++ b/cub/test/catch2_test_device_transform_tile.cu @@ -0,0 +1,143 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +// The tile dispatch path only exists when nvcc is invoked with --enable-tile and the user opts in via +// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH. In any other build this file compiles to a single skipped test. +#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() +# include + +# include + +# include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); + +namespace ct = ::cuda::tiles; + +// Each scalar op (passed to Transform, used by the CUB fallback) pairs with a tile-side mirror +// registered through tile_operator. The bodies use tile-tile arithmetic and wrap for unsigned types, +// so the tile result is bit-exact with the host reference. + +// Unary: v * v. +struct square_op +{ + template + __host__ __device__ T operator()(T v) const + { + return static_cast(v * v); + } +}; +struct tile_square_op +{ + template + __tile__ auto operator()(T v) const + { + return v * v; + } +}; + +// Binary: a + b. +struct add_op +{ + template + __host__ __device__ auto operator()(A a, B b) const + { + return static_cast(a + b); + } +}; +struct tile_add_op +{ + template + __tile__ auto operator()(A a, B b) const + { + return a + b; + } +}; + +CUB_NAMESPACE_BEGIN +namespace transform +{ +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_square_op; +}; + +template +inline constexpr bool tile_eligible_v = true; +template <> +struct tile_operator +{ + using type = tile_add_op; +}; +} // namespace transform +CUB_NAMESPACE_END + +// Unsigned types so arithmetic wraps deterministically and matches the host reference bit-for-bit. +using tile_types = c2h::type_list<::cuda::std::uint32_t, ::cuda::std::uint64_t>; + +// Sizes span the runtime preconditions: multiples of 16 (with aligned c2h buffers) take the tile +// kernel; the others fall back to the standard CUB dispatch. Both must produce identical results. +# define TILE_TRANSFORM_SIZES GENERATE(::cuda::std::int64_t{0}, 16, 32, 128, 1024, 4096, 65536, 17, 127, 1000) + +C2H_TEST("DeviceTransform tile dispatch: unary scalar op routed through its tile_operator substitute", + "[device][transform][tile]", + tile_types) +{ + using type = c2h::get<0, TestType>; + const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES; + CAPTURE(c2h::type_name(), num_items); + + c2h::device_vector in(num_items, thrust::no_init); + c2h::gen(C2H_SEED(2), in); + c2h::device_vector result(num_items, thrust::no_init); + + transform_many(::cuda::std::make_tuple(in.begin()), result.begin(), num_items, square_op{}); + + c2h::host_vector in_h = in; + c2h::host_vector reference_h(num_items, thrust::no_init); + std::transform(in_h.begin(), in_h.end(), reference_h.begin(), square_op{}); + REQUIRE(reference_h == result); +} + +C2H_TEST("DeviceTransform tile dispatch: binary scalar op routed through its tile_operator substitute", + "[device][transform][tile]", + tile_types) +{ + using type = c2h::get<0, TestType>; + const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES; + CAPTURE(c2h::type_name(), num_items); + + c2h::device_vector a(num_items, thrust::no_init); + c2h::device_vector b(num_items, thrust::no_init); + c2h::gen(C2H_SEED(2), a); + c2h::gen(C2H_SEED(2), b); + c2h::device_vector result(num_items, thrust::no_init); + + transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, add_op{}); + + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items, thrust::no_init); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), add_op{}); + REQUIRE(reference_h == result); +} + +#else // !_CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() + +C2H_TEST("DeviceTransform tile dispatch requires --enable-tile", "[device][transform][tile]") +{ + SUCCEED("tile transform dispatch not enabled in this build"); +} + +#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index f6cd76cc929..69d1509ebd2 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -112,6 +112,11 @@ # define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned) +#if _CCCL_TILE_COMPILATION() // __builtin_assume_aligned is not supported in tile mode +# undef _CCCL_BUILTIN_ASSUME_ALIGNED +# define _CCCL_BUILTIN_ASSUME_ALIGNED(_Ptr, ...) (_Ptr) +#endif // _CCCL_TILE_COMPILATION() + #if _CCCL_CHECK_BUILTIN(builtin_constant_p) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_constant_p) diff --git a/libcudacxx/include/cuda/std/__new/launder.h b/libcudacxx/include/cuda/std/__new/launder.h index e2f3af192a0..3d67950fc18 100644 --- a/libcudacxx/include/cuda/std/__new/launder.h +++ b/libcudacxx/include/cuda/std/__new/launder.h @@ -32,6 +32,10 @@ # define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC) +#if _CCCL_TILE_COMPILATION() // __builtin_launder is not supported in tile mode +# undef _CCCL_BUILTIN_LAUNDER +#endif // _CCCL_TILE_COMPILATION() + _CCCL_BEGIN_NAMESPACE_CUDA_STD template