From 6e91c891361ae961e09ef89f6bd6c18df66c02da Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 14:29:10 -0700
Subject: [PATCH 01/83] tile DeviceTransform policy picker

---
 .../bench/transform/tile/device_transform.cuh | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 cub/benchmarks/bench/transform/tile/device_transform.cuh
diff --git a/cub/benchmarks/bench/transform/tile/device_transform.cuh b/cub/benchmarks/bench/transform/tile/device_transform.cuh
new file mode 100644
index 00000000000..2849deb92d5
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/device_transform.cuh
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// tile port of cub::DeviceTransform - tile-size policy picker.
+// Mirrors the bytes-in-flight target used by cub's transform policy so
+// the tile launches land at comparable occupancy.
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+namespace cub_tile::detail {
+
+constexpr int min_bytes_in_flight_per_sm(int cc_x10) {   // bytes-in-flight target per SM for an architecture value
+    return cc_x10 >= 1000 ? 64 * 1024    // B200
+         : cc_x10 >=  900 ? 48 * 1024    // H100/H200
+         : cc_x10 >=  800 ? 16 * 1024    // A100
+                          : 12 * 1024;   // fallback; NOTE(review): scale looks like 900 == sm_90 despite "_x10" — confirm
+}
+
+constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }   // a / b rounded up; intended for a >= 0 and b > 0
+constexpr int round_up_pow2(int x) {   // smallest power of two that is >= x (1 when x <= 1)
+    int pow2 = 1; for (; pow2 < x; pow2 *= 2) {} return pow2;
+}
+constexpr int min_size(int a) { return a; }   // recursion base case: a single value is its own minimum
+template <class... Ts> constexpr int min_size(int a, int b, Ts... rest) {
+    return min_size(b < a ? b : a, rest...);   // reduce the first two to their minimum, then recurse on the rest
+}
+
+// Picks the tile size (elements per launched block) for a transform writing Out and reading Ins...
+// items-per-thread = max(elements that fill one 16-byte vector access, elements needed to reach the
+// bytes-in-flight target), rounded up to a power of two and clamped to 32; the result is items * 128.
+// mufu_heavy=true says the functor body is MUFU-heavy: for sub-4-byte types, vectorized loads arrive
+// packed in registers and get unpacked/repacked, so items is also capped at 16/min_elem (cause unprofiled).
+template <typename Out, typename... Ins>
+constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) {
+    constexpr int threads_per_block    = 128;   // nominal value, used here only to size the tile
+    constexpr int vector_bytes         = 16;   // LDG.E.128 -> 16 bytes
+    constexpr int max_items_per_thread = 32;
+    constexpr int max_occupancy        = 16;   // presumably the blocks-per-SM count the target is spread over — confirm
+
+    constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...);   // smallest element size among Out and Ins
+    constexpr int items_for_vec  = ceil_div(vector_bytes, min_elem);
+
+    // Fill (zero inputs) keeps the same latency target by counting output bytes.
+    constexpr int bytes_per_iter = (sizeof...(Ins) > 0)
+        ? (int(sizeof(Ins)) + ... + 0)
+        : int(sizeof(Out));
+    const int target = min_bytes_in_flight_per_sm(cc_x10);
+    const int items_for_latency =
+        ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter);
+
+    int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency;
+    items = round_up_pow2(items);
+    if (items > max_items_per_thread) items = max_items_per_thread;
+
+    if (mufu_heavy && min_elem < 4) {
+        const int byte_cap = vector_bytes / min_elem;   // 16 for I8, 8 for I16/half/bf16
+        if (items > byte_cap) items = byte_cap;
+    }
+
+    return items * threads_per_block;
+}
+
+} // namespace cub_tile::detail

From 4980aab745a91c8d5611ce83ca5d5ba5d21e6619 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 14:30:07 -0700
Subject: [PATCH 02/83] tile DeviceTransform kernels + public API

---
 .../bench/transform/tile/device_transform.cuh | 101 +++++++++++++++++-
 1 file changed, 98 insertions(+), 3 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/device_transform.cuh b/cub/benchmarks/bench/transform/tile/device_transform.cuh
index 2849deb92d5..57a0b965985 100644
--- a/cub/benchmarks/bench/transform/tile/device_transform.cuh
+++ b/cub/benchmarks/bench/transform/tile/device_transform.cuh
@@ -1,14 +1,18 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// tile port of cub::DeviceTransform - tile-size policy picker.
-// Mirrors the bytes-in-flight target used by cub's transform policy so
-// the tile launches land at comparable occupancy.
+// tile port of cub::DeviceTransform.
+// Public surface mirrors cub::DeviceTransform::{Transform, Fill}; the
+// kernels themselves are written against the tile DSL (cuda::tiles).
 
 #pragma once
 
 #include <cuda_runtime.h>
 #include <cstdint>
+#include <cuda/std/tuple>
+#include <cuda/std/utility>
+
+#include "cuda_tile.h"
 
 namespace cub_tile::detail {
 
@@ -63,4 +67,95 @@ constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) {
     return items * threads_per_block;
 }
 
+template <int TileSize, typename Fn, typename Out, typename... Ins>   // each block transforms one TileSize-element tile
+__tile_global__ void transform_kernel(int64_t num_items_, Out* __restrict__ out_,
+                                      const Ins* __restrict__... ins_) {
+    namespace ct = cuda::tiles;
+
+    const auto bx = ct::bid().x;   // block index, used below as the tile index for the loads and the store
+    Fn fn{};                       // functor is default-constructed in the kernel; nothing is passed from the host
+
+    auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));   // NOTE(review): presumably promises num_items >= 0 and a multiple of 16; nothing visible here checks that — confirm
+    auto out       = ct::assume_aligned<16>(out_);   // presumably a 16-byte alignment promise on the output pointer — TODO confirm
+
+    auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+    auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};   // presumably the output split into TileSize-element tiles
+
+    auto load_one = [bx, num_items](auto* ptr_) {   // loads this block's tile from one input array
+        auto ptr  = ct::assume_aligned<16>(ptr_);
+        auto span = ct::tensor_span{ptr, ct::extents{num_items}};
+        auto view = ct::partition_view{span, ct::shape<TileSize>{}};
+        return view.load_masked(bx);   // masked load; presumably guards a partial last tile — confirm
+    };
+
+    out_view.store_masked(fn(load_one(ins_)...), bx);   // apply fn to the loaded tiles and store the result tile
+}
+
+// Launches transform_kernel with one block per TileSize-element tile; returns the launch status.
+template <int TileSize, typename Fn, typename Out, typename... Ins,
+          ::cuda::std::size_t... Idx>
+cudaError_t launch_impl(
+    ::cuda::std::tuple<Ins*...> inputs,
+    Out* output,
+    int64_t num_items,
+    cudaStream_t stream,
+    ::cuda::std::index_sequence<Idx...>) {
+    if (num_items <= 0) return cudaSuccess;   // empty range: nothing to launch
+    // (num_items - 1) / TileSize + 1 is the same ceil-div without the overflow of num_items + TileSize - 1.
+    const int64_t num_blocks = (num_items - 1) / TileSize + 1;
+    // gridDim.x cannot exceed 2^31-1; fail loudly instead of letting the cast below truncate the count.
+    if (num_blocks > 0x7fffffffLL) return cudaErrorInvalidValue;
+    transform_kernel<TileSize, Fn><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
+        num_items, output, ::cuda::std::get<Idx>(inputs)...);
+    return cudaGetLastError();
+}
+
+template <int TileSize, typename T>   // each block writes one TileSize-element tile filled with `value`
+__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) {
+    namespace ct = cuda::tiles;
+    const auto bx = ct::bid().x;   // block index, used below as the tile index for the store
+
+    auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));   // NOTE(review): presumably promises num_items >= 0 and a multiple of 16; nothing visible here checks that — confirm
+    auto out       = ct::assume_aligned<16>(out_);   // presumably a 16-byte alignment promise on the output pointer — TODO confirm
+
+    auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+    auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
+    using tile_t  = ct::tile<T, ct::shape<TileSize>>;
+    out_view.store_masked(ct::full<tile_t>(value), bx);   // masked store; presumably guards a partial last tile — confirm
+}
+
 } // namespace cub_tile::detail
+
+namespace cub_tile {
+
+struct DeviceTransform {
+    // Element-wise transform of `inputs` into `output`. TileSize == 0 lets pick_tile_size choose. The Fn
+    // argument only supplies the type: the kernel default-constructs its own Fn, so state is not forwarded.
+    template <int TileSize = 0,
+              bool MufuHeavy = false,
+              typename Fn, typename Out, typename... Ins>
+    static cudaError_t Transform(
+        ::cuda::std::tuple<Ins*...> inputs,
+        Out* output,
+        int64_t num_items,
+        Fn,
+        cudaStream_t stream = 0) {
+        constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size<Out, Ins...>(MufuHeavy);
+        return detail::launch_impl<chosen, Fn>(
+            inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
+    }
+
+    // Writes `value` to every element of output[0, num_items); a no-op success when num_items <= 0.
+    template <int TileSize = 0, typename T>
+    static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) {
+        if (num_items <= 0) return cudaSuccess;
+        constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size<T>();
+        const int64_t num_blocks = (num_items - 1) / chosen + 1;   // ceil-div that cannot overflow
+        if (num_blocks > 0x7fffffffLL) return cudaErrorInvalidValue;   // gridDim.x limit; the cast below would truncate
+        detail::fill_kernel<chosen, T><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
+            num_items, output, value);
+        return cudaGetLastError();
+    }
+};
+
+} // namespace cub_tile

From 848a64519f259d56cb287764628973d690534f2e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 15:09:31 -0700
Subject: [PATCH 03/83] bench_init RNG helper

---
 .../bench/transform/tile/bench_init.cuh       | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 cub/benchmarks/bench/transform/tile/bench_init.cuh

diff --git a/cub/benchmarks/bench/transform/tile/bench_init.cuh b/cub/benchmarks/bench/transform/tile/bench_init.cuh
new file mode 100644
index 00000000000..da3e37f8c40
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/bench_init.cuh
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cstdint>
+#include <type_traits>
+
+namespace bench_init {
+
+// splitmix64 — fast deterministic PRNG, one mix per element.
+__device__ __forceinline__ uint64_t splitmix64(uint64_t x) {
+    uint64_t z = x + 0x9E3779B97F4A7C15ULL;          // additive step, then two xor-shift/multiply rounds
+    z ^= z >> 30;  z *= 0xBF58476D1CE4E5B9ULL;
+    z ^= z >> 27;  z *= 0x94D049BB133111EBULL;
+    return z ^ (z >> 31);                            // final xor-shift
+}
+
+// Map a uint64 to a "reasonable" finite value of T in roughly [-1, 1) for floats,
+// or to a non-zero byte for small ints (so neither all-zero nor pathological).
+template <typename T>
+__device__ __forceinline__ T from_random(uint64_t r) {
+    // one compile-time branch per element type; float, __half and __nv_bfloat16 share a branch
+    constexpr bool is_f32  = std::is_same_v<T, float>;
+    constexpr bool is_f16  = std::is_same_v<T, __half>;
+    constexpr bool is_bf16 = std::is_same_v<T, __nv_bfloat16>;
+    if constexpr (std::is_same_v<T, double>) {
+        // top 53 bits times 2^-52 lies in [0, 2); subtracting 1 recentres it to [-1, 1)
+        const uint64_t bits53 = r >> 11;
+        return double(bits53) * (1.0 / double(1ull << 52)) - 1.0;
+    } else if constexpr (is_f32 || is_f16 || is_bf16) {
+        // top 24 bits times 2^-23 lies in [0, 2); subtracting 1 recentres it to [-1, 1).
+        // __half and __nv_bfloat16 are produced by converting that same float value.
+        const uint32_t bits24 = uint32_t(r >> 40);
+        const float unit = float(bits24) * (1.0f / float(1u << 23)) - 1.0f;
+        if constexpr (is_f16)       return __float2half(unit);
+        else if constexpr (is_bf16) return __float2bfloat16(unit);
+        else                        return unit;
+    } else {
+        // any other T: magnitude 1..128 from the low 7 bits, negated when bit 8 is set,
+        // so the int value is never zero before it is converted to T
+        const int magnitude = int(r & 0x7f) + 1;
+        const int signed_v  = (r & 0x100) ? -magnitude : magnitude;
+        return T(signed_v);
+    }
+}
+
+template <typename T>
+__global__ void rand_fill_kernel(T* __restrict__ p, int64_t n, uint64_t seed) {
+    const int64_t first = int64_t(blockIdx.x) * blockDim.x + threadIdx.x;   // this thread's global index
+    const int64_t step  = int64_t(gridDim.x) * blockDim.x;                  // total threads in the grid
+    for (int64_t idx = first; idx < n; idx += step)                          // grid-stride loop over [0, n)
+        p[idx] = from_random<T>(splitmix64(seed ^ uint64_t(idx)));          // value depends only on (seed, idx)
+}
+
+template <typename T>
+inline void rand_fill(T* p, int64_t n, uint64_t seed = 0xC0FFEE) {   // asynchronous fill of p[0, n) on the default stream
+    if (n <= 0) return;   // a zero-block grid is an invalid launch configuration and would leave an error pending
+    constexpr int block = 256;
+    const int64_t nblocks = (n + block - 1) / block;
+    rand_fill_kernel<T><<<int(nblocks < 65535 ? nblocks : 65535), block>>>(p, n, seed);   // grid capped at 65535 blocks; the kernel's stride loop covers the rest
+}
+
+} // namespace bench_init

From 463213628fa7fa2bd5949a9e9e7b520559842366 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 15:09:35 -0700
Subject: [PATCH 04/83] babelstream tile bench

---
 .../bench/transform/tile/babelstream.cu       | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 cub/benchmarks/bench/transform/tile/babelstream.cu

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
new file mode 100644
index 00000000000..cbc7942b037
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// BabelStream-style bandwidth benchmarks on cub_tile::DeviceTransform.
+// Mirror of cub/benchmarks/bench/transform/babelstream.cu so we can compare
+// numbers side-by-side.
+
+#include <nvbench/nvbench.cuh>
+
+#include "device_transform.cuh"
+
+#include <cuda_runtime.h>
+#include <cuda/std/tuple>
+#include <vector>
+#include <cstdint>
+
+#include "bench_init.cuh"
+
+#ifndef TILE_SIZE
+#define TILE_SIZE 0     // 0 = auto-pick via detail::pick_tile_size
+#endif
+#define STR_(x) #x
+#define STR(x) STR_(x)
+
+struct mul_op     { template <class B>                  __tile__ auto operator()(B b) const         { return -(b + b); } };   // -2*b built from add and negate only
+struct add_op     { template <class A, class B>         __tile__ auto operator()(A a, B b) const    { return a + b; } };   // element-wise sum
+struct triad_op   { template <class B, class C>         __tile__ auto operator()(B b, C c) const    { return b - c - c; } };   // b - 2*c built from subtractions only
+struct nstream_op { template <class A, class B, class C> __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; } };   // a + b - 2*c
+
+// True if `bytes_needed` worth of GPU memory is available, with 5% headroom
+// for driver overhead. Caller should `state.skip(...)` on false.
+inline bool gpu_mem_available(size_t bytes_needed) {
+    size_t free_b = 0, total_b = 0;
+    const bool queried = cudaMemGetInfo(&free_b, &total_b) == cudaSuccess;   // a failed query counts as "not available"
+    return queried && (bytes_needed + (bytes_needed / 20) < free_b);         // request plus 5% must fit in free memory
+}
+
+template <typename T>
+struct Buffers {   // owns three n-element device arrays (a, b, c); they are released in the destructor
+    T *a{}, *b{}, *c{};
+    int64_t n{};
+    Buffers(int64_t n) : n(n) {
+        cudaMalloc(&a, n * sizeof(T));
+        cudaMalloc(&b, n * sizeof(T));
+        cudaMalloc(&c, n * sizeof(T));
+        // touch every page so HBM is actually backed; values don't matter for BW measurement.
+        bench_init::rand_fill(a, n, 0xA111);
+        bench_init::rand_fill(b, n, 0xB222);
+        bench_init::rand_fill(c, n, 0xC333);
+        cudaDeviceSynchronize();
+    }
+    Buffers(const Buffers&) = delete; Buffers& operator=(const Buffers&) = delete;   // a copy would cudaFree the same pointers twice
+    ~Buffers() { cudaFree(a); cudaFree(b); cudaFree(c); }
+};
+
+// --- benchmarks ---
+template <typename T>
+void mul(nvbench::state& state, nvbench::type_list<T>) {   // c = mul_op(b): one array read, one array written
+    const auto n = state.get_int64("Elements{io}");
+    if (!gpu_mem_available(3 * size_t(n) * sizeof(T))) { state.skip("not enough free GPU memory"); return; }   // Buffers allocates 3 arrays
+    Buffers<T> buf(n);
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform<TILE_SIZE>(::cuda::std::make_tuple(buf.b), buf.c, n, mul_op{}, launch.get_stream());
+    });
+}
+
+template <typename T>
+void add(nvbench::state& state, nvbench::type_list<T>) {   // c = add_op(a, b): two arrays read, one array written
+    const auto n = state.get_int64("Elements{io}");
+    if (!gpu_mem_available(3 * size_t(n) * sizeof(T))) { state.skip("not enough free GPU memory"); return; }   // Buffers allocates 3 arrays
+    Buffers<T> buf(n);
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(2 * n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform<TILE_SIZE>(::cuda::std::make_tuple(buf.a, buf.b), buf.c, n, add_op{}, launch.get_stream());
+    });
+}
+
+template <typename T>
+void triad(nvbench::state& state, nvbench::type_list<T>) {   // a = triad_op(b, c): two arrays read, one array written
+    const auto n = state.get_int64("Elements{io}");
+    if (!gpu_mem_available(3 * size_t(n) * sizeof(T))) { state.skip("not enough free GPU memory"); return; }   // Buffers allocates 3 arrays
+    Buffers<T> buf(n);
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(2 * n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform<TILE_SIZE>(::cuda::std::make_tuple(buf.b, buf.c), buf.a, n, triad_op{}, launch.get_stream());
+    });
+}
+
+template <typename T>
+void nstream(nvbench::state& state, nvbench::type_list<T>) {   // a = nstream_op(a, b, c), in place. NOTE(review): a is both input and output while transform_kernel's pointers are __restrict__ — confirm this is intended
+    const auto n = state.get_int64("Elements{io}");
+    if (!gpu_mem_available(3 * size_t(n) * sizeof(T))) { state.skip("not enough free GPU memory"); return; }   // Buffers allocates 3 arrays
+    Buffers<T> buf(n);
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(3 * n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform<TILE_SIZE>(::cuda::std::make_tuple(buf.a, buf.b, buf.c), buf.a, n, nstream_op{}, launch.get_stream());
+    });
+}
+
+using types = nvbench::type_list<std::int8_t, std::int16_t, float, double>;   // element types swept by every bench below
+inline auto sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};   // exponents for the power-of-two "Elements{io}" axis
+
+NVBENCH_BENCH_TYPES(mul,     NVBENCH_TYPE_AXES(types)).set_name("tile_mul_ts"     STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(add,     NVBENCH_TYPE_AXES(types)).set_name("tile_add_ts"     STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(triad,   NVBENCH_TYPE_AXES(types)).set_name("tile_triad_ts"   STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
+
+NVBENCH_MAIN

From ea6ae70a7ea63cc09dcb9bd2632e070fed83a18a Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 15:09:54 -0700
Subject: [PATCH 05/83] pytorch tile bench

---
 .../bench/transform/tile/pytorch.cu           | 111 ++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 cub/benchmarks/bench/transform/tile/pytorch.cu

diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
new file mode 100644
index 00000000000..76330520629
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -0,0 +1,111 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// PyTorch ops on tile.  Uses ct::tanh / ct::sin / ct::exp / ct::select.
+
+#include <nvbench/nvbench.cuh>
+#include "device_transform.cuh"
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda/std/tuple>
+#include <vector>
+
+#include "bench_init.cuh"
+
+namespace ct = cuda::tiles;
+
+// --- Unary --- (compute in float, cast back so the same ops work for __half/__bf16/float)
+template <class T> __tile__ auto as_float(T v) { return ct::element_cast<float>(v); }   // casts the elements of tile v to float
+template <class T, class F> __tile__ auto from_float(F f) { return ct::element_cast<ct::tile_element_t<T>>(f); }   // casts f back to the element type of tile type T
+
+struct relu_op    { template <class T> __tile__ auto operator()(T v) const {
+    auto f = as_float(v); return from_float<T>(ct::select(f > 0.0f, f, f - f)); } };   // f - f supplies the zero operand (it is NaN, not 0, when f is infinite or NaN)
+struct sigmoid_op { template <class T> __tile__ auto operator()(T v) const {
+    auto f = as_float(v); return from_float<T>(1.0f / (1.0f + ct::exp(-f))); } };   // logistic function 1 / (1 + e^-f)
+struct tanh_op    { template <class T> __tile__ auto operator()(T v) const {
+    return from_float<T>(ct::tanh(as_float(v))); } };
+struct gelu_op    { template <class T> __tile__ auto operator()(T v) const {
+    constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;   // k0 is sqrt(2/pi); constants of the tanh-based GELU formula
+    auto f = as_float(v);
+    return from_float<T>(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); } };
+struct sin_op { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::sin(as_float(v))); } };
+struct exp_op { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::exp(as_float(v))); } };
+
+// --- Binary ---
+struct binary_add  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a + b; } };
+struct binary_sub  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a - b; } };
+struct binary_mul  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a * b; } };
+struct binary_div  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a / b; } };
+// le/ge: the comparison result is cast to A's element type so it can be stored in an
+//        output buffer of that type (CUB does the same implicit cast via its iterator path).
+struct binary_le   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a <= b); } };
+struct binary_ge   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a >= b); } };
+struct binary_fmin { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } };   // picks a where a < b, otherwise b
+struct binary_fmax { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } };   // picks a where a > b, otherwise b
+
+
+template <typename Op, typename T, bool MufuHeavy = false>
+void run_unary(nvbench::state& state) {   // times Op applied element-wise from `in` to `out`
+    const auto n = state.get_int64("Elements{io}");
+    T *in = nullptr, *out = nullptr;
+    if (cudaMalloc(&in, n * sizeof(T)) != cudaSuccess || cudaMalloc(&out, n * sizeof(T)) != cudaSuccess) {
+        cudaFree(in); cudaFree(out); state.skip("cudaMalloc failed"); return; }   // never run kernels on null buffers
+    bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize();
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform<0, MufuHeavy>(::cuda::std::make_tuple(in), out, n, Op{}, launch.get_stream());
+    });
+    cudaFree(in); cudaFree(out);
+}
+
+template <typename Op, typename T>
+void run_binary(nvbench::state& state) {   // times Op applied element-wise from (a, b) to `out`
+    const auto n = state.get_int64("Elements{io}");
+    T *a = nullptr, *b = nullptr, *out = nullptr;
+    // A failed allocation would otherwise hand null pointers to the kernels below; skip this state instead.
+    if (cudaMalloc(&a, n*sizeof(T)) != cudaSuccess || cudaMalloc(&b, n*sizeof(T)) != cudaSuccess ||
+        cudaMalloc(&out, n*sizeof(T)) != cudaSuccess) {
+        cudaFree(a); cudaFree(b); cudaFree(out); state.skip("cudaMalloc failed"); return; }
+    bench_init::rand_fill(a, n, 0xA111); bench_init::rand_fill(b, n, 0xB222); cudaDeviceSynchronize();
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(2*n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform(::cuda::std::make_tuple(a, b), out, n, Op{}, launch.get_stream());
+    });
+    cudaFree(a); cudaFree(b); cudaFree(out);
+}
+
+using element_types = nvbench::type_list<__half, __nv_bfloat16, float>;   // element types swept by every bench in this file
+inline auto pt_sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};   // exponents for the power-of-two "Elements{io}" axis
+
+#define UNARY_BENCH(name, op, mufu) \
+    template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_unary<op, T, mufu>(state); } \
+    NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes);
+
+// The third macro argument is forwarded as run_unary's MufuHeavy flag. It is set for ops
+// dominated by MUFU intrinsics (exp/tanh/sin/cos); relu is just compare+select, so no hint.
+UNARY_BENCH(relu,    relu_op,    false)
+UNARY_BENCH(sigmoid, sigmoid_op, true)
+UNARY_BENCH(tanh,    tanh_op,    true)
+UNARY_BENCH(gelu,    gelu_op,    true)
+UNARY_BENCH(sin,     sin_op,     true)
+UNARY_BENCH(exp,     exp_op,     true)
+
+#define BINARY_BENCH(name, op) \
+    template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_binary<op, T>(state); } \
+    NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_pt_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes);
+
+BINARY_BENCH(add,  binary_add)
+BINARY_BENCH(sub,  binary_sub)
+BINARY_BENCH(mul,  binary_mul)
+BINARY_BENCH(div,  binary_div)
+BINARY_BENCH(le,   binary_le)
+BINARY_BENCH(ge,   binary_ge)
+BINARY_BENCH(fmin, binary_fmin)
+BINARY_BENCH(fmax, binary_fmax)
+
+NVBENCH_MAIN

From 57c712c7178d5e668f5e256e16a440d3a93a8d4c Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 15:09:58 -0700
Subject: [PATCH 06/83] copy/grayscale/fill tile benches

---
 cub/benchmarks/bench/transform/tile/copy.cu   | 44 +++++++++++++++
 cub/benchmarks/bench/transform/tile/fill.cu   | 30 +++++++++++
 .../bench/transform/tile/grayscale.cu         | 53 +++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 cub/benchmarks/bench/transform/tile/copy.cu
 create mode 100644 cub/benchmarks/bench/transform/tile/fill.cu
 create mode 100644 cub/benchmarks/bench/transform/tile/grayscale.cu

diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
new file mode 100644
index 00000000000..6bb34a7157d
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// Pure copy bench (identity transform) — tile side.
+// Isolates the load/store path from any arithmetic on top: useful for
+// catching narrow-type store wars (e.g. byte stores capping BW).
+
+#include <nvbench/nvbench.cuh>
+#include "device_transform.cuh"
+#include <cuda_runtime.h>
+#include <cuda/std/tuple>
+#include <vector>
+#include <cstdint>
+
+#include "bench_init.cuh"
+
+struct identity {   // pass-through functor: returns its argument unchanged
+    template <class T> __tile__ auto operator()(T v) const { return v; }
+};
+
+template <typename T>
+void copy(nvbench::state& state, nvbench::type_list<T>) {   // times an identity transform from `in` to `out`
+    const auto n = state.get_int64("Elements{io}");
+    T *in = nullptr, *out = nullptr;
+    if (cudaMalloc(&in, n * sizeof(T)) != cudaSuccess || cudaMalloc(&out, n * sizeof(T)) != cudaSuccess) {
+        cudaFree(in); cudaFree(out); state.skip("cudaMalloc failed"); return; }   // never run kernels on null buffers
+    bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize();
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform(::cuda::std::make_tuple(in), out, n, identity{}, launch.get_stream());
+    });
+    cudaFree(in); cudaFree(out);
+}
+
+using types = nvbench::type_list<std::int8_t, std::int16_t, std::int32_t, float, double>;
+inline auto sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};
+
+NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(types))
+    .set_name("tile_copy")
+    .add_int64_power_of_two_axis("Elements{io}", sizes);
+
+NVBENCH_MAIN
diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu
new file mode 100644
index 00000000000..2b0da544d38
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/fill.cu
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// Fill: zero-input broadcast.  CUB models this as Transform with empty input tuple
+// and a no-arg op.  Tile can't express zero-input Transform directly, so we use the
+// dedicated cub_tile::DeviceTransform::Fill API which writes a constant.
+
+#include <nvbench/nvbench.cuh>
+#include "device_transform.cuh"
+#include <cuda_runtime.h>
+
+// Times cub_tile::DeviceTransform::Fill writing the constant T(42) to n elements.
+template <typename T>
+void fill(nvbench::state& state, nvbench::type_list<T>) {
+    const auto n = state.get_int64("Elements{io}");
+    T* out = nullptr;
+    if (cudaMalloc(&out, n * sizeof(T)) != cudaSuccess) { state.skip("cudaMalloc failed"); return; }   // never launch on a null buffer
+    state.add_element_count(n);
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) { cub_tile::DeviceTransform::Fill(out, n, T(42), launch.get_stream()); });
+    cudaFree(out);
+}
+
+// CUB sweeps integral types: int8/16/32/64
+using fill_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;
+
+NVBENCH_BENCH_TYPES(fill, NVBENCH_TYPE_AXES(fill_types)).set_name("tile_fill")
+    .add_int64_power_of_two_axis("Elements{io}", std::vector<nvbench::int64_t>{16, 20, 24, 28, 31});
+
+NVBENCH_MAIN
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
new file mode 100644
index 00000000000..1f1db7fc737
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// Grayscale: RGB pixel -> luminance.
+// CUB stores rgb_t<float> (12 bytes) packed; this tile bench instead reads R, G and B
+// as three separate input streams (see rgb_to_y below), so it does not depend on tile
+// accepting a 3-component struct as an element type.
+
+#include <nvbench/nvbench.cuh>
+#include "device_transform.cuh"
+#include "bench_init.cuh"
+#include <cuda_runtime.h>
+#include <cuda/std/tuple>
+#include <vector>
+
+// Three-stream version (R, G, B as separate input arrays).
+// Computationally equivalent to CUB's packed rgb_t version.
+struct rgb_to_y {   // weighted sum of the three channel tiles
+    template <class R, class G, class B>
+    __tile__ auto operator()(R r, G g, B b) const {
+        constexpr float w_r = 0.2989f;   // red weight
+        constexpr float w_g = 0.587f;    // green weight
+        constexpr float w_b = 0.114f;    // blue weight
+        return w_r * r + w_g * g + w_b * b;   // weights are float constants regardless of the tiles' element type
+    }
+};
+
+template <typename T>
+void grayscale(nvbench::state& state, nvbench::type_list<T>) {   // times rgb_to_y over three input arrays into `out`
+    const auto n = state.get_int64("Elements{io}");
+    T *r = nullptr, *g = nullptr, *b = nullptr, *out = nullptr;
+    if (cudaMalloc(&r, n*sizeof(T)) != cudaSuccess || cudaMalloc(&g, n*sizeof(T)) != cudaSuccess ||
+        cudaMalloc(&b, n*sizeof(T)) != cudaSuccess || cudaMalloc(&out, n*sizeof(T)) != cudaSuccess) {
+        cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out); state.skip("cudaMalloc failed"); return; }   // never run kernels on null buffers
+    bench_init::rand_fill(r, n, 0xA111);
+    bench_init::rand_fill(g, n, 0xA222);
+    bench_init::rand_fill(b, n, 0xA333);
+    cudaDeviceSynchronize();   // finish initialisation before timing starts, as the other benches do
+    state.add_element_count(n);
+    state.add_global_memory_reads<T>(3 * n);   // matches CUB's rgb_t<T> = 3*sizeof(T)
+    state.add_global_memory_writes<T>(n);
+    state.exec([&](nvbench::launch& launch) {
+        cub_tile::DeviceTransform::Transform(::cuda::std::make_tuple(r, g, b), out, n, rgb_to_y{}, launch.get_stream());
+    });
+    cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out);
+}
+
+using value_types = nvbench::type_list<float, double>;
+
+NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types)).set_name("tile_grayscale")
+    .add_int64_power_of_two_axis("Elements{io}", std::vector<nvbench::int64_t>{16, 20, 24, 28, 31});
+
+NVBENCH_MAIN

From 7ecb113fa21fdbfaa9320b65f41907c6afbe497d Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 1 Jun 2026 15:10:02 -0700
Subject: [PATCH 07/83] tile DeviceTransform tests

---
 .../transform/tile/test_device_transform.cu   | 171 ++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 cub/benchmarks/bench/transform/tile/test_device_transform.cu

diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
new file mode 100644
index 00000000000..d787a0df578
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
@@ -0,0 +1,171 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// Standalone correctness tests for cub_tile::DeviceTransform.
+// Sits next to the benches so it builds against the same tileiras
+// toolchain and does not pretend to be part of CCCL's catch2 suite.
+
+#include "device_transform.cuh"
+
+#include <cuda_runtime.h>
+#include <cuda/std/tuple>
+
+#include <cstdio>
+#include <cstdint>
+#include <cstdlib>
+#include <cmath>
+#include <vector>
+
+namespace {
+
+int g_failures = 0;
+
+#define CUDA_CHECK(expr)                                                                  \
+    do {                                                                                  \
+        cudaError_t _e = (expr);                                                          \
+        if (_e != cudaSuccess) {                                                          \
+            std::fprintf(stderr, "%s:%d  CUDA error: %s\n", __FILE__, __LINE__,           \
+                         cudaGetErrorString(_e));                                         \
+            std::exit(2);                                                                 \
+        }                                                                                 \
+    } while (0)
+
+// Exact equality for every type that has no dedicated overload below.
+template <typename T> bool eq(T a, T b) { return a == b; }
+// float overload: relative tolerance of 1e-5 on the larger magnitude, floored at 1e-6 absolute.
+inline bool eq(float a, float b) {
+    const float magnitude = std::fmax(std::fabs(a), std::fabs(b));
+    return std::fabs(a - b) <= std::fmax(1e-5f * magnitude, 1e-6f);
+}
+
+// Element-wise eq() comparison of got vs want. Prints at most four individual mismatches, then
+// either one [FAIL] summary (incrementing g_failures once) or one [ OK ] line.
+template <typename T>
+void expect_array(const char* name, const std::vector<T>& got, const std::vector<T>& want) {
+    if (got.size() != want.size()) {
+        std::fprintf(stderr, "[FAIL] %s: size %zu != %zu\n", name, got.size(), want.size());
+        ++g_failures;
+        return;
+    }
+    int bad = 0;
+    for (size_t idx = 0; idx != got.size(); ++idx) {
+        if (eq(got[idx], want[idx])) continue;
+        if (bad++ < 4) {   // only the first four mismatches are printed, all are counted
+            std::fprintf(stderr, "[FAIL] %s: idx=%zu got=%g want=%g\n",
+                         name, idx, double(got[idx]), double(want[idx]));
+        }
+    }
+    if (bad == 0) { std::printf("[ OK ] %s (n=%zu)\n", name, got.size()); return; }
+    ++g_failures; std::fprintf(stderr, "[FAIL] %s: %d mismatches\n", name, bad);
+}
+
+struct identity_op { template <class A> __tile__ auto operator()(A a) const { return a; } };
+struct square_op   { template <class A> __tile__ auto operator()(A a) const { return a * a; } };
+struct add_op      { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a + b; } };
+struct mul_op      { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a * b; } };
+
+// Host vector of n values in which element i equals T(start + step * T(i)).
+template <typename T> std::vector<T> ramp(int64_t n, T start = T{0}, T step = T{1}) {
+    std::vector<T> seq(n);
+    for (int64_t k = n; k-- > 0;) seq[k] = T(start + step * T(k));
+    return seq;
+}
+
+template <typename T>
+struct GpuVec {   // owning device buffer of n elements of T; the destructor releases it
+    T* d{}; int64_t n{};
+    explicit GpuVec(int64_t n) : n(n) { CUDA_CHECK(cudaMalloc(&d, n * sizeof(T))); }
+    explicit GpuVec(const std::vector<T>& h) : GpuVec(int64_t(h.size())) {   // allocate, then upload h
+        CUDA_CHECK(cudaMemcpy(d, h.data(), n * sizeof(T), cudaMemcpyHostToDevice));
+    }
+    GpuVec(const GpuVec&) = delete; GpuVec& operator=(const GpuVec&) = delete;   // a copy would cudaFree d twice
+    ~GpuVec() { if (d) cudaFree(d); }
+    std::vector<T> to_host() const {   // copies all n elements back into a new host vector
+        std::vector<T> h(n);
+        CUDA_CHECK(cudaMemcpy(h.data(), d, n * sizeof(T), cudaMemcpyDeviceToHost));
+        return h;
+    }
+};
+
+// identity_op over the ramp 1, 2, ..., n: the output buffer must equal the input.
+template <typename T> void test_identity(int64_t n) {
+    const std::vector<T> host_src = ramp<T>(n, T{1}, T{1});
+    GpuVec<T> src(host_src);
+    GpuVec<T> dst(n);
+    CUDA_CHECK(cub_tile::DeviceTransform::Transform(::cuda::std::make_tuple(src.d), dst.d, n, identity_op{}));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    expect_array("identity", dst.to_host(), host_src);
+}
+
+// square_op over the ramp 1, 2, ..., n, checked against squares computed on the host.
+template <typename T> void test_square(int64_t n) {
+    const std::vector<T> host_src = ramp<T>(n, T{1}, T{1});
+    std::vector<T> expected;
+    expected.reserve(host_src.size());
+    for (const T& v : host_src) expected.push_back(v * v);
+    GpuVec<T> src(host_src), dst(n);
+    CUDA_CHECK(cub_tile::DeviceTransform::Transform(::cuda::std::make_tuple(src.d), dst.d, n, square_op{}));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    expect_array("square", dst.to_host(), expected);
+}
+
+// add_op over two ramps (1, 2, ... and 100, 102, ...), checked against host-side sums.
+template <typename T> void test_add(int64_t n) {
+    const std::vector<T> lhs = ramp<T>(n, T{1}, T{1});
+    const std::vector<T> rhs = ramp<T>(n, T{100}, T{2});
+    std::vector<T> expected(n);
+    for (size_t k = 0; k < expected.size(); ++k) expected[k] = lhs[k] + rhs[k];
+    GpuVec<T> d_lhs(lhs), d_rhs(rhs), d_sum(n);
+    CUDA_CHECK(cub_tile::DeviceTransform::Transform(
+        ::cuda::std::make_tuple(d_lhs.d, d_rhs.d), d_sum.d, n, add_op{}));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    expect_array("add", d_sum.to_host(), expected);
+}
+
+// mul_op over two ramps (1, 2, ... and 3, 4, ...), checked against host-side products.
+template <typename T> void test_mul(int64_t n) {
+    const std::vector<T> lhs = ramp<T>(n, T{1}, T{1});
+    const std::vector<T> rhs = ramp<T>(n, T{3}, T{1});
+    std::vector<T> expected(n);
+    for (size_t k = 0; k < expected.size(); ++k) expected[k] = lhs[k] * rhs[k];
+    GpuVec<T> d_lhs(lhs), d_rhs(rhs), d_prod(n);
+    CUDA_CHECK(cub_tile::DeviceTransform::Transform(
+        ::cuda::std::make_tuple(d_lhs.d, d_rhs.d), d_prod.d, n, mul_op{}));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    expect_array("mul", d_prod.to_host(), expected);
+}
+
+// Fill writes `value` to all n outputs; the readback must be n copies of it.
+template <typename T> void test_fill(int64_t n, T value) {
+    const std::vector<T> expected(n, value);
+    GpuVec<T> dst(n);
+    CUDA_CHECK(cub_tile::DeviceTransform::Fill(dst.d, n, value));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    expect_array("fill", dst.to_host(), expected);
+}
+
+} // namespace
+
+int main() {
+    // power-of-two sizes; NOTE(review): with the default tile picker these look like whole tiles only -- confirm
+    test_identity<std::int32_t>(4096);
+    test_square<std::int32_t>(2048);
+    test_add<float>(4096);
+    test_mul<float>(2048);
+    test_fill<std::int32_t>(1024, 42);
+
+    // non-pow-2 num_items (still multiple of 16 to satisfy assume_divisible<16>)
+    test_add<float>(4112);     // 16 * 257
+    test_fill<std::int32_t>(1008, -7);   // 16 * 63
+
+    // tiny inputs (16 and 64, both divisible by 16): below one tile, as the default picker returns >= 128 items
+    test_square<std::int32_t>(16);
+    test_add<float>(64);
+
+    if (g_failures) {   // g_failures counts failed expect_array calls; exit status 1 if any failed
+        std::fprintf(stderr, "\n%d test group(s) FAILED\n", g_failures);
+        return 1;
+    }
+    std::printf("\nall tests passed\n");
+    return 0;
+}

From d5eff222b573c6133e408e2b7a91ecf81a308038 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 13:27:03 -0700
Subject: [PATCH 08/83] move tile DeviceTransform header into CUB public path

The cub_tile::DeviceTransform implementation moves from
cub/benchmarks/bench/transform/tile/device_transform.cuh into
cub/cub/device/dispatch/dispatch_transform_tile.cuh so the CUB header
tree can reference it. The bench .cu files now include from the new
path. The hand-rolled ceil_div and round_up_pow2 helpers are replaced
with cuda::ceil_div and cuda::next_power_of_two from <cuda/cmath>.
---
 .../bench/transform/tile/babelstream.cu       |   2 +-
 cub/benchmarks/bench/transform/tile/copy.cu   |   2 +-
 .../bench/transform/tile/device_transform.cuh | 161 ---------------
 cub/benchmarks/bench/transform/tile/fill.cu   |   2 +-
 .../bench/transform/tile/grayscale.cu         |   2 +-
 .../bench/transform/tile/pytorch.cu           |   2 +-
 .../transform/tile/test_device_transform.cu   |   2 +-
 .../dispatch/dispatch_transform_tile.cuh      | 194 ++++++++++++++++++
 8 files changed, 200 insertions(+), 167 deletions(-)
 delete mode 100644 cub/benchmarks/bench/transform/tile/device_transform.cuh
 create mode 100644 cub/cub/device/dispatch/dispatch_transform_tile.cuh

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index cbc7942b037..1e180f850a4 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -7,7 +7,7 @@
 
 #include <nvbench/nvbench.cuh>
 
-#include "device_transform.cuh"
+#include <cub/device/dispatch/dispatch_transform_tile.cuh>
 
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index 6bb34a7157d..951af8b0fed 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -6,7 +6,7 @@
 // catching narrow-type store wars (e.g. byte stores capping BW).
 
 #include <nvbench/nvbench.cuh>
-#include "device_transform.cuh"
+#include <cub/device/dispatch/dispatch_transform_tile.cuh>
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
 #include <vector>
diff --git a/cub/benchmarks/bench/transform/tile/device_transform.cuh b/cub/benchmarks/bench/transform/tile/device_transform.cuh
deleted file mode 100644
index 57a0b965985..00000000000
--- a/cub/benchmarks/bench/transform/tile/device_transform.cuh
+++ /dev/null
@@ -1,161 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
-
-// tile port of cub::DeviceTransform.
-// Public surface mirrors cub::DeviceTransform::{Transform, Fill}; the
-// kernels themselves are written against the tile DSL (cuda::tiles).
-
-#pragma once
-
-#include <cuda_runtime.h>
-#include <cstdint>
-#include <cuda/std/tuple>
-#include <cuda/std/utility>
-
-#include "cuda_tile.h"
-
-namespace cub_tile::detail {
-
-constexpr int min_bytes_in_flight_per_sm(int cc_x10) {
-    if (cc_x10 >= 1000) return 64 * 1024;   // B200
-    if (cc_x10 >=  900) return 48 * 1024;   // H100/H200
-    if (cc_x10 >=  800) return 16 * 1024;   // A100
-    return 12 * 1024;
-}
-
-constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }
-constexpr int round_up_pow2(int x) {
-    int p = 1; while (p < x) p *= 2; return p;
-}
-constexpr int min_size(int a) { return a; }
-template <class... Ts> constexpr int min_size(int a, int b, Ts... rest) {
-    int m = a < b ? a : b; return min_size(m, rest...);
-}
-
-// mufu_heavy=true tells the policy the functor body has heavy MUFU usage.
-// for small data types, vectorized load will make them arrive packed in registers
-// and the compiler unpacks them and packs them back. reducing the compute work per
-// thread helps here.
-// need profiling to know the exact cause
-template <typename Out, typename... Ins>
-constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000) {
-    constexpr int threads_per_block    = 128;
-    constexpr int vector_bytes         = 16;   // LDG.E.128 -> 16 bytes
-    constexpr int max_items_per_thread = 32;
-    constexpr int max_occupancy        = 16;
-
-    constexpr int min_elem = min_size(int(sizeof(Out)), int(sizeof(Ins))...);
-    constexpr int items_for_vec  = ceil_div(vector_bytes, min_elem);
-
-    // Fill (zero inputs) keeps the same latency target by counting output bytes.
-    constexpr int bytes_per_iter = (sizeof...(Ins) > 0)
-        ? (int(sizeof(Ins)) + ... + 0)
-        : int(sizeof(Out));
-    const int target = min_bytes_in_flight_per_sm(cc_x10);
-    const int items_for_latency =
-        ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter);
-
-    int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency;
-    items = round_up_pow2(items);
-    if (items > max_items_per_thread) items = max_items_per_thread;
-
-    if (mufu_heavy && min_elem < 4) {
-        const int byte_cap = vector_bytes / min_elem;   // 16 for I8, 8 for I16/half/bf16
-        if (items > byte_cap) items = byte_cap;
-    }
-
-    return items * threads_per_block;
-}
-
-template <int TileSize, typename Fn, typename Out, typename... Ins>
-__tile_global__ void transform_kernel(int64_t num_items_, Out* __restrict__ out_,
-                                      const Ins* __restrict__... ins_) {
-    namespace ct = cuda::tiles;
-
-    const auto bx = ct::bid().x;
-    Fn fn{};
-
-    auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
-    auto out       = ct::assume_aligned<16>(out_);
-
-    auto out_span = ct::tensor_span{out, ct::extents{num_items}};
-    auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
-
-    auto load_one = [bx, num_items](auto* ptr_) {
-        auto ptr  = ct::assume_aligned<16>(ptr_);
-        auto span = ct::tensor_span{ptr, ct::extents{num_items}};
-        auto view = ct::partition_view{span, ct::shape<TileSize>{}};
-        return view.load_masked(bx);
-    };
-
-    out_view.store_masked(fn(load_one(ins_)...), bx);
-}
-
-template <int TileSize, typename Fn, typename Out, typename... Ins,
-          ::cuda::std::size_t... Idx>
-cudaError_t launch_impl(
-    ::cuda::std::tuple<Ins*...> inputs,
-    Out* output,
-    int64_t num_items,
-    cudaStream_t stream,
-    ::cuda::std::index_sequence<Idx...>) {
-
-    if (num_items <= 0) return cudaSuccess;
-
-    const int64_t num_blocks = (num_items + TileSize - 1) / TileSize;
-
-    transform_kernel<TileSize, Fn><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
-        num_items, output, ::cuda::std::get<Idx>(inputs)...);
-
-    return cudaGetLastError();
-}
-
-template <int TileSize, typename T>
-__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value) {
-    namespace ct = cuda::tiles;
-    const auto bx = ct::bid().x;
-
-    auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
-    auto out       = ct::assume_aligned<16>(out_);
-
-    auto out_span = ct::tensor_span{out, ct::extents{num_items}};
-    auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
-    using tile_t  = ct::tile<T, ct::shape<TileSize>>;
-    out_view.store_masked(ct::full<tile_t>(value), bx);
-}
-
-} // namespace cub_tile::detail
-
-namespace cub_tile {
-
-struct DeviceTransform {
-    template <int TileSize = 0,
-              bool MufuHeavy = false,
-              typename Fn, typename Out, typename... Ins>
-    static cudaError_t Transform(
-        ::cuda::std::tuple<Ins*...> inputs,
-        Out* output,
-        int64_t num_items,
-        Fn,
-        cudaStream_t stream = 0) {
-        constexpr int chosen = (TileSize > 0)
-            ? TileSize
-            : detail::pick_tile_size<Out, Ins...>(MufuHeavy);
-        return detail::launch_impl<chosen, Fn>(
-            inputs, output, num_items, stream,
-            ::cuda::std::index_sequence_for<Ins...>{});
-    }
-
-    // Fill
-    template <int TileSize = 0, typename T>
-    static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0) {
-        if (num_items <= 0) return cudaSuccess;
-        constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size<T>();
-        const int64_t num_blocks = (num_items + chosen - 1) / chosen;
-        detail::fill_kernel<chosen, T><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
-            num_items, output, value);
-        return cudaGetLastError();
-    }
-};
-
-} // namespace cub_tile
diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu
index 2b0da544d38..5514c1a1287 100644
--- a/cub/benchmarks/bench/transform/tile/fill.cu
+++ b/cub/benchmarks/bench/transform/tile/fill.cu
@@ -6,7 +6,7 @@
 // dedicated cub_tile::DeviceTransform::Fill API which writes a constant.
 
 #include <nvbench/nvbench.cuh>
-#include "device_transform.cuh"
+#include <cub/device/dispatch/dispatch_transform_tile.cuh>
 #include <cuda_runtime.h>
 
 template <typename T>
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index 1f1db7fc737..14641c2d872 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -7,7 +7,7 @@
 // we'll then fall back to treating R/G/B as three separate float streams.
 
 #include <nvbench/nvbench.cuh>
-#include "device_transform.cuh"
+#include <cub/device/dispatch/dispatch_transform_tile.cuh>
 #include "bench_init.cuh"
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 76330520629..e1eee3e4452 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -4,7 +4,7 @@
 // PyTorch ops on tile.  Uses ct::tanh / ct::sin / ct::exp / ct::select.
 
 #include <nvbench/nvbench.cuh>
-#include "device_transform.cuh"
+#include <cub/device/dispatch/dispatch_transform_tile.cuh>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
index d787a0df578..0df21dc66a3 100644
--- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu
+++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
@@ -5,7 +5,7 @@
 // Sits next to the benches so it builds against the same tileiras
 // toolchain and does not pretend to be part of CCCL's catch2 suite.
 
-#include "device_transform.cuh"
+#include <cub/device/dispatch/dispatch_transform_tile.cuh>
 
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
new file mode 100644
index 00000000000..f75cb8c3ccc
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -0,0 +1,194 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tile port of cub::DeviceTransform. The public surface mirrors
+// cub::DeviceTransform::{Transform, Fill}; the kernels are written against the
+// tile DSL (cuda::tiles). This header is only safe to include when nvcc is
+// invoked with --enable-tile and CTK >= 13.3.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/cmath>
+#include <cuda/std/tuple>
+#include <cuda/std/utility>
+
+#include <cuda_runtime.h>
+#include <cuda_tile.h>
+
+#include <cstdint>
+
+namespace cub_tile::detail
+{
+
+constexpr int min_bytes_in_flight_per_sm(int cc_x10)
+{
+  if (cc_x10 >= 1000)
+  {
+    return 64 * 1024; // B200
+  }
+  if (cc_x10 >= 900)
+  {
+    return 48 * 1024; // H100/H200
+  }
+  if (cc_x10 >= 800)
+  {
+    return 16 * 1024; // A100
+  }
+  return 12 * 1024;
+}
+
+constexpr int min_size(int a)
+{
+  return a;
+}
+template <class... Ts>
+constexpr int min_size(int a, int b, Ts... rest)
+{
+  int m = a < b ? a : b;
+  return min_size(m, rest...);
+}
+
+// mufu_heavy=true tells the policy the functor body has heavy MUFU usage.
+// for small data types, vectorized load will make them arrive packed in
+// registers and the compiler unpacks them and packs them back. reducing the
+// compute work per thread helps here. need profiling to know the exact cause.
+template <typename Out, typename... Ins>
+constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000)
+{
+  constexpr int threads_per_block    = 128;
+  constexpr int vector_bytes         = 16; // LDG.E.128 -> 16 bytes
+  constexpr int max_items_per_thread = 32;
+  constexpr int max_occupancy        = 16;
+
+  constexpr int min_elem      = min_size(int(sizeof(Out)), int(sizeof(Ins))...);
+  constexpr int items_for_vec = static_cast<int>(::cuda::ceil_div(vector_bytes, min_elem));
+
+  // Fill (zero inputs) keeps the same latency target by counting output bytes.
+  constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out));
+  const int target             = min_bytes_in_flight_per_sm(cc_x10);
+  const int items_for_latency =
+    static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
+
+  int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency;
+  items     = static_cast<int>(::cuda::next_power_of_two(static_cast<unsigned int>(items)));
+  if (items > max_items_per_thread)
+  {
+    items = max_items_per_thread;
+  }
+
+  if (mufu_heavy && min_elem < 4)
+  {
+    const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
+    if (items > byte_cap)
+    {
+      items = byte_cap;
+    }
+  }
+
+  return items * threads_per_block;
+}
+
+template <int TileSize, typename Fn, typename Out, typename... Ins>
+__tile_global__ void
+transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_)
+{
+  namespace ct = cuda::tiles;
+
+  const auto bx = ct::bid().x;
+  Fn fn{};
+
+  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
+  auto out       = ct::assume_aligned<16>(out_);
+
+  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
+
+  auto load_one = [bx, num_items](auto* ptr_) {
+    auto ptr  = ct::assume_aligned<16>(ptr_);
+    auto span = ct::tensor_span{ptr, ct::extents{num_items}};
+    auto view = ct::partition_view{span, ct::shape<TileSize>{}};
+    return view.load_masked(bx);
+  };
+
+  out_view.store_masked(fn(load_one(ins_)...), bx);
+}
+
+template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std::size_t... Idx>
+cudaError_t launch_impl(
+  ::cuda::std::tuple<Ins*...> inputs,
+  Out* output,
+  int64_t num_items,
+  cudaStream_t stream,
+  ::cuda::std::index_sequence<Idx...>)
+{
+  if (num_items <= 0)
+  {
+    return cudaSuccess;
+  }
+
+  const int64_t num_blocks = (num_items + TileSize - 1) / TileSize;
+
+  transform_kernel<TileSize, Fn><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
+    num_items, output, ::cuda::std::get<Idx>(inputs)...);
+
+  return cudaGetLastError();
+}
+
+template <int TileSize, typename T>
+__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value)
+{
+  namespace ct  = cuda::tiles;
+  const auto bx = ct::bid().x;
+
+  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
+  auto out       = ct::assume_aligned<16>(out_);
+
+  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
+  using tile_t  = ct::tile<T, ct::shape<TileSize>>;
+  out_view.store_masked(ct::full<tile_t>(value), bx);
+}
+
+} // namespace cub_tile::detail
+
+namespace cub_tile
+{
+
+struct DeviceTransform
+{
+  template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
+  static cudaError_t
+  Transform(::cuda::std::tuple<Ins*...> inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0)
+  {
+    constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size<Out, Ins...>(MufuHeavy);
+    return detail::launch_impl<chosen, Fn>(
+      inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
+  }
+
+  // Fill
+  template <int TileSize = 0, typename T>
+  static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0)
+  {
+    if (num_items <= 0)
+    {
+      return cudaSuccess;
+    }
+    constexpr int chosen     = (TileSize > 0) ? TileSize : detail::pick_tile_size<T>();
+    const int64_t num_blocks = (num_items + chosen - 1) / chosen;
+    detail::fill_kernel<chosen, T>
+      <<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(num_items, output, value);
+    return cudaGetLastError();
+  }
+};
+
+} // namespace cub_tile

From 5f9885749d9371108f6034cb5e3b35314b4f696d Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 13:32:59 -0700
Subject: [PATCH 09/83] move tile dispatch into cub::detail::transform::tile
 namespace

The cub_tile namespace and its hand-rolled detail layout move under
cub::detail::transform::tile to match how CUB groups the existing
transform internals. A type alias is kept at cub_tile::DeviceTransform
so the benches and tests still compile during the transition.

The whole file body is also gated by _CCCL_CTK_AT_LEAST(13, 3) so
older toolchains never see the tile DSL types.
---
 .../dispatch/dispatch_transform_tile.cuh      | 49 +++++++++++--------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index f75cb8c3ccc..584d9829963 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -3,8 +3,8 @@
 
 // Tile port of cub::DeviceTransform. The public surface mirrors
 // cub::DeviceTransform::{Transform, Fill}; the kernels are written against the
-// tile DSL (cuda::tiles). This header is only safe to include when nvcc is
-// invoked with --enable-tile and CTK >= 13.3.
+// tile DSL (cuda::tiles). This header requires CTK 13.3 or newer and nvcc
+// invoked with --enable-tile.
 
 #pragma once
 
@@ -18,16 +18,20 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/cmath>
-#include <cuda/std/tuple>
-#include <cuda/std/utility>
+#if _CCCL_CTK_AT_LEAST(13, 3)
 
-#include <cuda_runtime.h>
-#include <cuda_tile.h>
+#  include <cuda/cmath>
+#  include <cuda/std/tuple>
+#  include <cuda/std/utility>
 
-#include <cstdint>
+#  include <cuda_runtime.h>
+#  include <cuda_tile.h>
 
-namespace cub_tile::detail
+#  include <cstdint>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
 {
 
 constexpr int min_bytes_in_flight_per_sm(int cc_x10)
@@ -159,20 +163,14 @@ __tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T val
   out_view.store_masked(ct::full<tile_t>(value), bx);
 }
 
-} // namespace cub_tile::detail
-
-namespace cub_tile
-{
-
 struct DeviceTransform
 {
   template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
   static cudaError_t
   Transform(::cuda::std::tuple<Ins*...> inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0)
   {
-    constexpr int chosen = (TileSize > 0) ? TileSize : detail::pick_tile_size<Out, Ins...>(MufuHeavy);
-    return detail::launch_impl<chosen, Fn>(
-      inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
+    constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size<Out, Ins...>(MufuHeavy);
+    return launch_impl<chosen, Fn>(inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
   }
 
   // Fill
@@ -183,12 +181,23 @@ struct DeviceTransform
     {
       return cudaSuccess;
     }
-    constexpr int chosen     = (TileSize > 0) ? TileSize : detail::pick_tile_size<T>();
+    constexpr int chosen     = (TileSize > 0) ? TileSize : pick_tile_size<T>();
     const int64_t num_blocks = (num_items + chosen - 1) / chosen;
-    detail::fill_kernel<chosen, T>
-      <<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(num_items, output, value);
+    fill_kernel<chosen, T><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(num_items, output, value);
     return cudaGetLastError();
   }
 };
 
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+// Compatibility shim. Existing benches and tests still call
+// cub_tile::DeviceTransform; once they move to cub::DeviceTransform with named
+// functors and the trait dispatch, this alias can be removed.
+namespace cub_tile
+{
+using DeviceTransform = ::cub::detail::transform::tile::DeviceTransform;
 } // namespace cub_tile
+
+#endif // _CCCL_CTK_AT_LEAST(13, 3)

From 8ae89ef3d3912d8951714c3e9a704f8e36163230 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 13:45:33 -0700
Subject: [PATCH 10/83] add tile dispatch trait header

---
 .../dispatch_transform_tile_traits.cuh        | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
new file mode 100644
index 00000000000..8bfdadaac87
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -0,0 +1,81 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Compile-time policy for cub::DeviceTransform's tile path.
+//
+// tile_eligible_v<Op, T, NIn> answers "should DeviceTransform::Transform
+// route to the tile kernel for this (functor, element type, input arity)?".
+// tile_mufu_heavy_v<Op> hints the tile policy picker that Op spends most of
+// its time on MUFU instructions, so the picker caps items/thread at the
+// vector width to avoid piling up MUFU work that cannot SIMD on Blackwell
+// for sub-4-byte types.
+//
+// This header is pure trait infrastructure; no callers yet. Specializations
+// land here as benches confirm tile wins for a (Op, T, NIn) combination.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CTK_AT_LEAST(13, 3)
+
+#  include <cuda/std/__cccl/extended_data_types.h>
+#  include <cuda/std/__functional/operations.h>
+#  include <cuda/std/__type_traits/integral_constant.h>
+
+#  include <cstddef>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
+{
+
+// Primary template: tile path is opt-in. Specialize for combinations where a
+// bench has shown the tile kernel beats the existing CUB algorithms.
+template <typename Op, typename T, ::cuda::std::size_t NIn>
+struct tile_eligible : ::cuda::std::false_type
+{};
+
+template <typename Op, typename T, ::cuda::std::size_t NIn>
+inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
+
+// Companion trait: report Op as MUFU-heavy so the tile policy picker caps
+// items/thread at the vector width on small element types. Default is false.
+template <typename Op>
+struct tile_mufu_heavy : ::cuda::std::false_type
+{};
+
+template <typename Op>
+inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
+
+#  if _CCCL_HAS_NVFP16()
+template <>
+struct tile_eligible<::cuda::std::plus<__half>, __half, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_eligible<::cuda::std::multiplies<__half>, __half, 2> : ::cuda::std::true_type
+{};
+#  endif // _CCCL_HAS_NVFP16()
+
+#  if _CCCL_HAS_NVBF16()
+template <>
+struct tile_eligible<::cuda::std::plus<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_eligible<::cuda::std::multiplies<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type
+{};
+#  endif // _CCCL_HAS_NVBF16()
+
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CTK_AT_LEAST(13, 3)

From 37c8b718441a828fdbb4f46c5c0fafb9d128d87f Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 13:54:47 -0700
Subject: [PATCH 11/83] split tile dispatch into kernel and tuning headers

---
 .../dispatch/dispatch_transform_tile.cuh      | 113 +-----------------
 .../kernels/kernel_transform_tile.cuh         |  76 ++++++++++++
 .../dispatch/tuning/tuning_transform_tile.cuh | 103 ++++++++++++++++
 3 files changed, 182 insertions(+), 110 deletions(-)
 create mode 100644 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
 create mode 100644 cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 584d9829963..47f25c46bd9 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -20,12 +20,13 @@
 
 #if _CCCL_CTK_AT_LEAST(13, 3)
 
-#  include <cuda/cmath>
+#  include <cub/device/dispatch/kernels/kernel_transform_tile.cuh>
+#  include <cub/device/dispatch/tuning/tuning_transform_tile.cuh>
+
 #  include <cuda/std/tuple>
 #  include <cuda/std/utility>
 
 #  include <cuda_runtime.h>
-#  include <cuda_tile.h>
 
 #  include <cstdint>
 
@@ -34,99 +35,6 @@ CUB_NAMESPACE_BEGIN
 namespace detail::transform::tile
 {
 
-constexpr int min_bytes_in_flight_per_sm(int cc_x10)
-{
-  if (cc_x10 >= 1000)
-  {
-    return 64 * 1024; // B200
-  }
-  if (cc_x10 >= 900)
-  {
-    return 48 * 1024; // H100/H200
-  }
-  if (cc_x10 >= 800)
-  {
-    return 16 * 1024; // A100
-  }
-  return 12 * 1024;
-}
-
-constexpr int min_size(int a)
-{
-  return a;
-}
-template <class... Ts>
-constexpr int min_size(int a, int b, Ts... rest)
-{
-  int m = a < b ? a : b;
-  return min_size(m, rest...);
-}
-
-// mufu_heavy=true tells the policy the functor body has heavy MUFU usage.
-// for small data types, vectorized load will make them arrive packed in
-// registers and the compiler unpacks them and packs them back. reducing the
-// compute work per thread helps here. need profiling to know the exact cause.
-template <typename Out, typename... Ins>
-constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000)
-{
-  constexpr int threads_per_block    = 128;
-  constexpr int vector_bytes         = 16; // LDG.E.128 -> 16 bytes
-  constexpr int max_items_per_thread = 32;
-  constexpr int max_occupancy        = 16;
-
-  constexpr int min_elem      = min_size(int(sizeof(Out)), int(sizeof(Ins))...);
-  constexpr int items_for_vec = static_cast<int>(::cuda::ceil_div(vector_bytes, min_elem));
-
-  // Fill (zero inputs) keeps the same latency target by counting output bytes.
-  constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out));
-  const int target             = min_bytes_in_flight_per_sm(cc_x10);
-  const int items_for_latency =
-    static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
-
-  int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency;
-  items     = static_cast<int>(::cuda::next_power_of_two(static_cast<unsigned int>(items)));
-  if (items > max_items_per_thread)
-  {
-    items = max_items_per_thread;
-  }
-
-  if (mufu_heavy && min_elem < 4)
-  {
-    const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
-    if (items > byte_cap)
-    {
-      items = byte_cap;
-    }
-  }
-
-  return items * threads_per_block;
-}
-
-template <int TileSize, typename Fn, typename Out, typename... Ins>
-__tile_global__ void
-transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_)
-{
-  namespace ct = cuda::tiles;
-
-  const auto bx = ct::bid().x;
-  Fn fn{};
-
-  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
-  auto out       = ct::assume_aligned<16>(out_);
-
-  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
-  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
-
-  auto load_one = [bx, num_items](auto* ptr_) {
-    auto ptr  = ct::assume_aligned<16>(ptr_);
-    auto span = ct::tensor_span{ptr, ct::extents{num_items}};
-    auto view = ct::partition_view{span, ct::shape<TileSize>{}};
-    return view.load_masked(bx);
-  };
-
-  out_view.store_masked(fn(load_one(ins_)...), bx);
-}
-
 template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std::size_t... Idx>
 cudaError_t launch_impl(
   ::cuda::std::tuple<Ins*...> inputs,
@@ -148,21 +56,6 @@ cudaError_t launch_impl(
   return cudaGetLastError();
 }
 
-template <int TileSize, typename T>
-__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value)
-{
-  namespace ct  = cuda::tiles;
-  const auto bx = ct::bid().x;
-
-  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
-  auto out       = ct::assume_aligned<16>(out_);
-
-  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
-  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
-  using tile_t  = ct::tile<T, ct::shape<TileSize>>;
-  out_view.store_masked(ct::full<tile_t>(value), bx);
-}
-
 struct DeviceTransform
 {
   template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
new file mode 100644
index 00000000000..3d038c9068f
--- /dev/null
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -0,0 +1,76 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels
+// assume every pointer is 16-byte aligned and num_items is a non-negative
+// multiple of 16 elements so the compiler can pick LDG.E.128. Callers in the
+// dispatch header are responsible for honoring those preconditions.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CTK_AT_LEAST(13, 3)
+
+#  include <cuda_tile.h>
+
+#  include <cstdint>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
+{
+
+template <int TileSize, typename Fn, typename Out, typename... Ins>
+__tile_global__ void
+transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_)
+{
+  namespace ct = cuda::tiles;
+  // bx selects the TileSize-element partition that this invocation loads from and stores to.
+  const auto bx = ct::bid().x;
+  Fn fn{}; // built here rather than passed in, so Fn carries no runtime state
+  // Unchecked promises to the compiler: num_items_ is a non-negative multiple of 16, out_ is 16-byte aligned.
+  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
+  auto out       = ct::assume_aligned<16>(out_);
+
+  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
+
+  auto load_one = [bx, num_items](auto* ptr_) {
+    auto ptr  = ct::assume_aligned<16>(ptr_); // every input pointer carries the same alignment promise
+    auto span = ct::tensor_span{ptr, ct::extents{num_items}};
+    auto view = ct::partition_view{span, ct::shape<TileSize>{}};
+    return view.load_masked(bx); // NOTE(review): presumably masks the final partial tile -- confirm
+  };
+  // Load partition bx of each input, apply fn to the loaded tiles, store the result to partition bx of out.
+  out_view.store_masked(fn(load_one(ins_)...), bx);
+}
+
+template <int TileSize, typename T>
+__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value)
+{
+  namespace ct  = cuda::tiles;
+  const auto bx = ct::bid().x; // index of the TileSize-element partition this invocation stores to
+  // Unchecked promises to the compiler: num_items_ is a non-negative multiple of 16, out_ is 16-byte aligned.
+  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
+  auto out       = ct::assume_aligned<16>(out_);
+
+  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
+  using tile_t  = ct::tile<T, ct::shape<TileSize>>;
+  out_view.store_masked(ct::full<tile_t>(value), bx); // NOTE(review): ct::full presumably broadcasts value -- confirm
+}
+
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CTK_AT_LEAST(13, 3)
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
new file mode 100644
index 00000000000..86c2d1b394f
--- /dev/null
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -0,0 +1,103 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Policy picker for cub::DeviceTransform's tile path. Mirrors the
+// bytes-in-flight target used by CUB's non-tile algorithms (see
+// tuning_transform.cuh's cc_to_min_bytes_in_flight) but expresses the
+// answer as a TileSize, since tile kernels partition by compile-time
+// shape rather than threads*items.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CTK_AT_LEAST(13, 3)
+
+#  include <cuda/cmath>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
+{
+
+constexpr int min_bytes_in_flight_per_sm(int cc_x10)
+{
+  if (cc_x10 >= 1000)
+  {
+    return 64 * 1024; // B200
+  }
+  if (cc_x10 >= 900)
+  {
+    return 48 * 1024; // H100/H200
+  }
+  if (cc_x10 >= 800)
+  {
+    return 16 * 1024; // A100
+  }
+  return 12 * 1024;
+}
+
+constexpr int min_size(int a)
+{
+  return a;
+}
+template <class... Ts>
+constexpr int min_size(int a, int b, Ts... rest)
+{
+  int m = a < b ? a : b;
+  return min_size(m, rest...);
+}
+
+// Returns the tile size in elements: items per thread times 128 threads per block. cc_x10 is matched
+// against the thresholds in min_bytes_in_flight_per_sm (1000 = B200, 900 = H100/H200, 800 = A100).
+// mufu_heavy=true marks a functor dominated by MUFU work; for element types under 4 bytes the items per
+// thread are then capped at one 16-byte vector. Suspected cause is pack/unpack overhead; TODO confirm by profiling.
+template <typename Out, typename... Ins>
+constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000)
+{
+  constexpr int threads_per_block    = 128;
+  constexpr int vector_bytes         = 16; // LDG.E.128 -> 16 bytes
+  constexpr int max_items_per_thread = 32;
+  constexpr int max_occupancy        = 16; // NOTE(review): presumably resident blocks per SM -- confirm
+  // Smallest element size among Out and Ins; items_for_vec elements of that size fill one 16-byte vector.
+  constexpr int min_elem      = min_size(int(sizeof(Out)), int(sizeof(Ins))...);
+  constexpr int items_for_vec = static_cast<int>(::cuda::ceil_div(vector_bytes, min_elem));
+
+  // Fill (zero inputs) keeps the same latency target by counting output bytes.
+  constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out));
+  const int target             = min_bytes_in_flight_per_sm(cc_x10);
+  const int items_for_latency =
+    static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
+  // Take the larger of the two lower bounds, round up to a power of two, then clamp to max_items_per_thread.
+  int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency;
+  items     = static_cast<int>(::cuda::next_power_of_two(static_cast<unsigned int>(items)));
+  if (items > max_items_per_thread)
+  {
+    items = max_items_per_thread;
+  }
+
+  if (mufu_heavy && min_elem < 4)
+  {
+    const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
+    if (items > byte_cap)
+    {
+      items = byte_cap;
+    }
+  }
+
+  return items * threads_per_block; // elements per tile
+}
+
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CTK_AT_LEAST(13, 3)

From 7fb935fd2c4ea638835cf9948d254c811b761996 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 14:14:10 -0700
Subject: [PATCH 12/83] wire tile dispatch hook into cub::DeviceTransform

---
 cub/cub/device/device_transform.cuh           | 31 ++++++++
 .../dispatch/dispatch_transform_tile.cuh      | 77 +++++++++++++++++++
 .../dispatch_transform_tile_traits.cuh        | 68 ++++++++++++----
 3 files changed, 161 insertions(+), 15 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index d8ad0354bfc..77c50432e34 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -17,6 +17,10 @@
 #include <cub/device/dispatch/dispatch_transform.cuh>
 #include <cub/util_namespace.cuh>
 
+#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#  include <cub/device/dispatch/dispatch_transform_tile.cuh>
+#endif
+
 #include <cuda/__execution/tune.h>
 #include <cuda/__functional/address_stability.h>
 #include <cuda/__functional/always_true_false.h>
@@ -99,6 +103,32 @@ struct DeviceTransform
 
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
+#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+    // Opt-in tile path. When every compile-time gate passes we route here
+    // and DO NOT instantiate the standard CUB transform dispatch below --
+    // under --enable-tile that path fails to compile for many (Op, T)
+    // combinations. Runtime alignment / divisibility violations on this
+    // branch surface as cudaErrorInvalidValue; the caller is expected to
+    // satisfy the 16-byte preconditions when opting into the tile path.
+    if constexpr (StableAddress == detail::transform::requires_stable_address::no
+                  && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
+                  && detail::transform::tile::tile_dispatch_eligible_v<
+                       TransformOp,
+                       RandomAccessIteratorOut,
+                       RandomAccessIteratorsIn...>)
+    {
+      cudaError_t tile_result;
+      if (detail::transform::tile::try_dispatch<TransformOp>(
+            inputs, output, static_cast<offset_t>(num_items), stream, tile_result))
+      {
+        return tile_result;
+      }
+      return cudaErrorInvalidValue;
+    }
+    else
+#endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION()
+    {
+
     using tuning_env =
       ::cuda::std::execution::__query_result_or_t<Env, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
     using default_policy_selector =
@@ -122,6 +152,7 @@ struct DeviceTransform
       ::cuda::std::move(transform_op),
       stream,
       policy_selector{});
+    }
   }
 
   // TODO(bgruber): we want to eventually forward the output tuple to the kernel and optimize writing multiple streams
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 47f25c46bd9..23b1fac2790 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -20,9 +20,20 @@
 
 #if _CCCL_CTK_AT_LEAST(13, 3)
 
+#  include <cub/device/dispatch/dispatch_transform_tile_traits.cuh>
 #  include <cub/device/dispatch/kernels/kernel_transform_tile.cuh>
 #  include <cub/device/dispatch/tuning/tuning_transform_tile.cuh>
 
+#  include <thrust/type_traits/is_contiguous_iterator.h>
+#  include <thrust/type_traits/unwrap_contiguous_iterator.h>
+
+#  include <cuda/__memory/is_aligned.h>
+#  include <cuda/std/__tuple_dir/apply.h>
+#  include <cuda/std/__utility/declval.h>
+#  include <cuda/std/__type_traits/is_empty.h>
+#  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
+#  include <cuda/std/__type_traits/remove_cv.h>
+#  include <cuda/std/__type_traits/remove_pointer.h>
 #  include <cuda/std/tuple>
 #  include <cuda/std/utility>
 
@@ -81,6 +92,72 @@ struct DeviceTransform
   }
 };
 
+namespace __detail
+{
+template <typename Iter>
+using __unwrapped_value_t =
+  ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t<decltype(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(
+    ::cuda::std::declval<Iter>()))>>;
+} // namespace __detail
+
+// Combined compile-time predicate used by cub::DeviceTransform's __transform_internal
+// to decide whether to route a given (Op, OutIter, InIters...) to the tile path.
+// The call site lifts this into an `if constexpr` so the standard CUB dispatch
+// is not instantiated when tile takes over (under --enable-tile the standard
+// path fails to compile for many functor/type combinations).
+template <typename Op, typename OutIter, typename... InIters>
+inline constexpr bool tile_dispatch_eligible_v =
+  THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
+  && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
+  && tile_eligible_v<Op, __detail::__unwrapped_value_t<OutIter>, sizeof...(InIters)>;
+
+// Bridge between cub::DeviceTransform::__transform_internal and the tile
+// DeviceTransform above. Precondition: tile_dispatch_eligible_v<Op, OutIter,
+// InIters...> is true. Returns true and writes the launch result when the
+// call was handled; returns false when the runtime 16-byte alignment /
+// divisibility preconditions are not satisfied (caller surfaces that as
+// cudaErrorInvalidValue -- there is no CUB fallback under --enable-tile).
+//
+// The tile kernel is launched with the trait's tile_op_type (a tile-friendly
+// mirror of Op with __tile__ operator), NOT the user's Op instance -- the
+// user's scalar functor cannot be invoked on ct::tile arguments.
+template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
+CUB_RUNTIME_FUNCTION bool try_dispatch(
+  ::cuda::std::tuple<InIters...> inputs,
+  OutIter output,
+  OffsetT num_items,
+  cudaStream_t stream,
+  cudaError_t& result)
+{
+  auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  auto in_ptrs = ::cuda::std::apply(
+    [](auto... iters) {
+      return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
+    },
+    inputs);
+  using out_value_t = ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t<decltype(out_ptr)>>;
+  using tile_op_t   = typename tile_eligible<TransformOp, out_value_t, sizeof...(InIters)>::tile_op_type;
+  static_assert(::cuda::std::is_empty_v<tile_op_t>,
+                "tile_op_type must be stateless (the tile kernel default-constructs it)");
+  static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
+                "tile_op_type must be trivially default constructible");
+
+  constexpr int kAlign = 16;
+  const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign);
+  const bool aligned_in =
+    ::cuda::std::apply([](auto... p) { return ((::cuda::is_aligned(p, kAlign)) && ...); }, in_ptrs);
+  // Tile DSL's tensor_span uses uint32_t shape; cap at 2^31 to stay below
+  // the wraparound cliff at 2^32.
+  constexpr OffsetT kMaxItems = OffsetT{1} << 31;
+  if (!aligned_out || !aligned_in || (num_items % kAlign) != 0 || num_items > kMaxItems)
+  {
+    return false;
+  }
+  result = DeviceTransform::template Transform<0, tile_mufu_heavy_v<TransformOp>, tile_op_t>(
+    in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
+  return true;
+}
+
 } // namespace detail::transform::tile
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index 8bfdadaac87..c823cc46b99 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -3,15 +3,20 @@
 
 // Compile-time policy for cub::DeviceTransform's tile path.
 //
+// Users call cub::DeviceTransform::Transform with whatever scalar functor they
+// have (e.g. cuda::std::plus<__half>). That functor is NOT directly callable
+// from a tile transform_kernel -- its operator() takes scalars, not ct::tile.
+// So eligible specializations declare a `tile_op_type` member that names a
+// tile-friendly replacement functor (with __tile__ templated operator()) that
+// performs the same operation. The dispatch hook then launches the tile
+// kernel with the replacement, not the user's original.
+//
 // tile_eligible_v<Op, T, NIn> answers "should DeviceTransform::Transform
 // route to the tile kernel for this (functor, element type, input arity)?".
 // tile_mufu_heavy_v<Op> hints the tile policy picker that Op spends most of
 // its time on MUFU instructions, so the picker caps items/thread at the
 // vector width to avoid piling up MUFU work that cannot SIMD on Blackwell
 // for sub-4-byte types.
-//
-// This header is pure trait infrastructure; no callers yet. Specializations
-// land here as benches confirm tile wins for a (Op, T, NIn) combination.
 
 #pragma once
 
@@ -33,13 +38,38 @@
 
 #  include <cstddef>
 
+#  if _CCCL_TILE_COMPILATION()
+#    include <cuda_tile.h>
+#  endif
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::transform::tile
 {
 
-// Primary template: tile path is opt-in. Specialize for combinations where a
-// bench has shown the tile kernel beats the existing CUB algorithms.
+#  if _CCCL_TILE_COMPILATION()
+// Tile-friendly mirrors of common cuda::std ops. Each has a __tile__
+// templated operator() so it can be invoked from inside transform_kernel
+// where the arguments are ct::tile<T, ...> rather than scalar T.
+struct tile_plus
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+
+struct tile_multiplies
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a * b;
+  }
+};
+#  endif // _CCCL_TILE_COMPILATION()
+
 template <typename Op, typename T, ::cuda::std::size_t NIn>
 struct tile_eligible : ::cuda::std::false_type
 {};
@@ -47,8 +77,6 @@ struct tile_eligible : ::cuda::std::false_type
 template <typename Op, typename T, ::cuda::std::size_t NIn>
 inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
 
-// Companion trait: report Op as MUFU-heavy so the tile policy picker caps
-// items/thread at the vector width on small element types. Default is false.
 template <typename Op>
 struct tile_mufu_heavy : ::cuda::std::false_type
 {};
@@ -56,23 +84,33 @@ struct tile_mufu_heavy : ::cuda::std::false_type
 template <typename Op>
 inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
 
-#  if _CCCL_HAS_NVFP16()
+#  if _CCCL_TILE_COMPILATION()
+#    if _CCCL_HAS_NVFP16()
 template <>
 struct tile_eligible<::cuda::std::plus<__half>, __half, 2> : ::cuda::std::true_type
-{};
+{
+  using tile_op_type = tile_plus;
+};
 template <>
 struct tile_eligible<::cuda::std::multiplies<__half>, __half, 2> : ::cuda::std::true_type
-{};
-#  endif // _CCCL_HAS_NVFP16()
+{
+  using tile_op_type = tile_multiplies;
+};
+#    endif // _CCCL_HAS_NVFP16()
 
-#  if _CCCL_HAS_NVBF16()
+#    if _CCCL_HAS_NVBF16()
 template <>
 struct tile_eligible<::cuda::std::plus<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type
-{};
+{
+  using tile_op_type = tile_plus;
+};
 template <>
 struct tile_eligible<::cuda::std::multiplies<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type
-{};
-#  endif // _CCCL_HAS_NVBF16()
+{
+  using tile_op_type = tile_multiplies;
+};
+#    endif // _CCCL_HAS_NVBF16()
+#  endif // _CCCL_TILE_COMPILATION()
 
 } // namespace detail::transform::tile
 

From 906fbecc2a88323e850b85d4532c732c0f0c586e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 14:24:08 -0700
Subject: [PATCH 13/83] drop runtime gates from tile dispatch helper

---
 cub/cub/device/device_transform.cuh           | 15 +++------
 .../dispatch/dispatch_transform_tile.cuh      | 32 ++++---------------
 2 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 77c50432e34..4890a908085 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -107,9 +107,9 @@ struct DeviceTransform
     // Opt-in tile path. When every compile-time gate passes we route here
     // and DO NOT instantiate the standard CUB transform dispatch below --
     // under --enable-tile that path fails to compile for many (Op, T)
-    // combinations. Runtime alignment / divisibility violations on this
-    // branch surface as cudaErrorInvalidValue; the caller is expected to
-    // satisfy the 16-byte preconditions when opting into the tile path.
+    // combinations. The 16-byte alignment, num_items divisibility, and the
+    // 2^31 size cap are the caller's contract once the trait flags the
+    // (Op, T, NIn) combo as tile-eligible.
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
                   && detail::transform::tile::tile_dispatch_eligible_v<
@@ -117,13 +117,8 @@ struct DeviceTransform
                        RandomAccessIteratorOut,
                        RandomAccessIteratorsIn...>)
     {
-      cudaError_t tile_result;
-      if (detail::transform::tile::try_dispatch<TransformOp>(
-            inputs, output, static_cast<offset_t>(num_items), stream, tile_result))
-      {
-        return tile_result;
-      }
-      return cudaErrorInvalidValue;
+      return detail::transform::tile::dispatch<TransformOp>(
+        inputs, output, static_cast<offset_t>(num_items), stream);
     }
     else
 #endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION()
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 23b1fac2790..41a5a4e9cb3 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -27,11 +27,10 @@
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
-#  include <cuda/__memory/is_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
-#  include <cuda/std/__utility/declval.h>
 #  include <cuda/std/__type_traits/is_empty.h>
 #  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
+#  include <cuda/std/__utility/declval.h>
 #  include <cuda/std/__type_traits/remove_cv.h>
 #  include <cuda/std/__type_traits/remove_pointer.h>
 #  include <cuda/std/tuple>
@@ -113,21 +112,16 @@ inline constexpr bool tile_dispatch_eligible_v =
 
 // Bridge between cub::DeviceTransform::__transform_internal and the tile
 // DeviceTransform above. Precondition: tile_dispatch_eligible_v<Op, OutIter,
-// InIters...> is true. Returns true and writes the launch result when the
-// call was handled; returns false when the runtime 16-byte alignment /
-// divisibility preconditions are not satisfied (caller surfaces that as
-// cudaErrorInvalidValue -- there is no CUB fallback under --enable-tile).
+// InIters...> is true. The 16-byte pointer alignment, num_items divisibility,
+// and 2^31 size cap (the tile DSL's uint32_t extent ceiling) are the caller's
+// contract -- opting into the tile path is opting into these preconditions.
 //
 // The tile kernel is launched with the trait's tile_op_type (a tile-friendly
 // mirror of Op with __tile__ operator), NOT the user's Op instance -- the
 // user's scalar functor cannot be invoked on ct::tile arguments.
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
-CUB_RUNTIME_FUNCTION bool try_dispatch(
-  ::cuda::std::tuple<InIters...> inputs,
-  OutIter output,
-  OffsetT num_items,
-  cudaStream_t stream,
-  cudaError_t& result)
+CUB_RUNTIME_FUNCTION cudaError_t dispatch(
+  ::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, cudaStream_t stream)
 {
   auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
   auto in_ptrs = ::cuda::std::apply(
@@ -142,20 +136,8 @@ CUB_RUNTIME_FUNCTION bool try_dispatch(
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_op_type must be trivially default constructible");
 
-  constexpr int kAlign = 16;
-  const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign);
-  const bool aligned_in =
-    ::cuda::std::apply([](auto... p) { return ((::cuda::is_aligned(p, kAlign)) && ...); }, in_ptrs);
-  // Tile DSL's tensor_span uses uint32_t shape; cap at 2^31 to stay below
-  // the wraparound cliff at 2^32.
-  constexpr OffsetT kMaxItems = OffsetT{1} << 31;
-  if (!aligned_out || !aligned_in || (num_items % kAlign) != 0 || num_items > kMaxItems)
-  {
-    return false;
-  }
-  result = DeviceTransform::template Transform<0, tile_mufu_heavy_v<TransformOp>, tile_op_t>(
+  return DeviceTransform::template Transform<0, tile_mufu_heavy_v<TransformOp>, tile_op_t>(
     in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
-  return true;
 }
 
 } // namespace detail::transform::tile

From 249131e6cb68e144ceea71e9495785616a68aaa0 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 14:28:29 -0700
Subject: [PATCH 14/83] add runtime alignment check before routing to tile

---
 cub/cub/device/device_transform.cuh           |  4 +++
 .../dispatch/dispatch_transform_tile.cuh      | 33 +++++++++++++++++--
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 4890a908085..4a27a08b8d5 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -117,6 +117,10 @@ struct DeviceTransform
                        RandomAccessIteratorOut,
                        RandomAccessIteratorsIn...>)
     {
+      if (!detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast<offset_t>(num_items)))
+      {
+        return cudaErrorInvalidValue;
+      }
       return detail::transform::tile::dispatch<TransformOp>(
         inputs, output, static_cast<offset_t>(num_items), stream);
     }
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 41a5a4e9cb3..414f64a3075 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -27,6 +27,7 @@
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
+#  include <cuda/__memory/is_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
 #  include <cuda/std/__type_traits/is_empty.h>
 #  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
@@ -110,11 +111,37 @@ inline constexpr bool tile_dispatch_eligible_v =
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
   && tile_eligible_v<Op, __detail::__unwrapped_value_t<OutIter>, sizeof...(InIters)>;
 
+// Runtime predicate consulted by the cub::DeviceTransform tile hook before
+// it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
+// guards the vectorized kernel. The tile kernels use ct::assume_aligned<16>
+// and ct::assume_divisible<16>, so violating these at runtime is UB.
+// Returns false to tell the hook to surface cudaErrorInvalidValue.
+template <typename OutIter, typename... InIters, typename OffsetT>
+CUB_RUNTIME_FUNCTION bool
+runtime_preconditions_ok(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
+{
+  constexpr int kAlign = 16;
+  // Tile DSL's tensor_span uses uint32_t shape internally; values >= 2^32 wrap to 0, so cap at 2^31.
+  // Hold the cap and the count in 64 bits: OffsetT{1} << 31 overflows when OffsetT is 32-bit signed.
+  constexpr ::cuda::std::int64_t kMaxItems = ::cuda::std::int64_t{1} << 31;
+  const auto count                         = static_cast<::cuda::std::int64_t>(num_items);
+
+  auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign);
+  const bool aligned_in  = ::cuda::std::apply(
+    [](auto... iters) {
+      return ((::cuda::is_aligned(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters), kAlign)) && ...);
+    },
+    inputs);
+  // count >= 0 backs the kernels' ct::assume_bounded_below<0>; count % 16 backs ct::assume_divisible<16>.
+  return aligned_out && aligned_in && count >= 0 && (count % kAlign) == 0 && count <= kMaxItems;
+}
+
 // Bridge between cub::DeviceTransform::__transform_internal and the tile
 // DeviceTransform above. Precondition: tile_dispatch_eligible_v<Op, OutIter,
-// InIters...> is true. The 16-byte pointer alignment, num_items divisibility,
-// and 2^31 size cap (the tile DSL's uint32_t extent ceiling) are the caller's
-// contract -- opting into the tile path is opting into these preconditions.
+// InIters...> is true AND runtime_preconditions_ok returned true. The kernel
+// itself assumes 16-byte pointer alignment and num_items divisibility; the
+// caller (the hook in device_transform.cuh) is responsible for checking
+// runtime_preconditions_ok first.
 //
 // The tile kernel is launched with the trait's tile_op_type (a tile-friendly
 // mirror of Op with __tile__ operator), NOT the user's Op instance -- the

From 744dbb32078849753956f8a984dcd1b269593434 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 15:59:44 -0700
Subject: [PATCH 15/83] drop _CCCL_TILE from _CCCL_API to unblock CUB under
 enable-tile

---
 libcudacxx/include/cuda/std/__cccl/visibility.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/__cccl/visibility.h b/libcudacxx/include/cuda/std/__cccl/visibility.h
index 075a98130aa..47337d8d8fd 100644
--- a/libcudacxx/include/cuda/std/__cccl/visibility.h
+++ b/libcudacxx/include/cuda/std/__cccl/visibility.h
@@ -116,7 +116,15 @@
 #  define _CCCL_DEVICE_API      _CCCL_DEVICE
 #  define _CCCL_TILE_API        _CCCL_TILE
 #else // ^^^ _CCCL_COMPILER(NVHPC) ^^^ / vvv !_CCCL_COMPILER(NVHPC) vvv
-#  define _CCCL_API             _CCCL_TILE _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
+// Local fork patch: drop _CCCL_TILE from _CCCL_API. Under the tile compiler's
+// local-only context check, marking a host/device utility __tile__ means its
+// body must satisfy tile restrictions even when the caller is non-tile. That
+// fails for any utility that takes a user-provided callable (apply, invoke,
+// visit, runtime_assume_aligned, ...). Drop the marker globally; tile DSL
+// code in this branch uses its own tile-marked operations and doesn't depend
+// on libcudacxx utilities being tile-callable. Revert when upstream fixes the
+// marking discipline (or the compiler adopts per-instantiation checking).
+#  define _CCCL_API             _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
 #  define _CCCL_HOST_DEVICE_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
 #  define _CCCL_HOST_API        _CCCL_HOST _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
 #  define _CCCL_DEVICE_API      _CCCL_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION

From 74fd6ccabe35911082f28cfcb3aaee9947ad2cc3 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 16:05:43 -0700
Subject: [PATCH 16/83] fall back to standard CUB dispatch when tile
 preconditions fail

---
 cub/cub/device/device_transform.cuh | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 4a27a08b8d5..1560d9e1a68 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -104,12 +104,12 @@ struct DeviceTransform
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
 #if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
-    // Opt-in tile path. When every compile-time gate passes we route here
-    // and DO NOT instantiate the standard CUB transform dispatch below --
-    // under --enable-tile that path fails to compile for many (Op, T)
-    // combinations. The 16-byte alignment, num_items divisibility, and the
-    // 2^31 size cap are the caller's contract once the trait flags the
-    // (Op, T, NIn) combo as tile-eligible.
+    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND
+    // the runtime alignment / divisibility / size preconditions hold, route
+    // to the tile kernel. Otherwise fall through to the standard CUB
+    // dispatch below, which does not rely on these preconditions, so a
+    // failed check is a graceful fallback, not an error. The standard
+    // dispatch is therefore instantiated even for tile-eligible combos.
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
                   && detail::transform::tile::tile_dispatch_eligible_v<
@@ -117,16 +117,13 @@ struct DeviceTransform
                        RandomAccessIteratorOut,
                        RandomAccessIteratorsIn...>)
     {
-      if (!detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast<offset_t>(num_items)))
+      if (detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast<offset_t>(num_items)))
       {
-        return cudaErrorInvalidValue;
+        return detail::transform::tile::dispatch<TransformOp>(
+          inputs, output, static_cast<offset_t>(num_items), stream);
       }
-      return detail::transform::tile::dispatch<TransformOp>(
-        inputs, output, static_cast<offset_t>(num_items), stream);
     }
-    else
 #endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION()
-    {
 
     using tuning_env =
       ::cuda::std::execution::__query_result_or_t<Env, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
@@ -151,7 +148,6 @@ struct DeviceTransform
       ::cuda::std::move(transform_op),
       stream,
       policy_selector{});
-    }
   }
 
   // TODO(bgruber): we want to eventually forward the output tuple to the kernel and optimize writing multiple streams

From b7e8c924aec76b8883feeff44eb84abc08e1a462 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 3 Jun 2026 17:44:11 -0700
Subject: [PATCH 17/83] migrate tile benches and tests to cub::DeviceTransform

---
 .../bench/transform/tile/babelstream.cu       |  87 +++++++---
 cub/benchmarks/bench/transform/tile/copy.cu   |  30 +++-
 cub/benchmarks/bench/transform/tile/fill.cu   |  14 +-
 .../bench/transform/tile/grayscale.cu         |  40 +++--
 .../bench/transform/tile/pytorch.cu           | 149 +++++++++++++-----
 .../transform/tile/test_device_transform.cu   |  76 +++++++--
 .../dispatch/dispatch_transform_tile.cuh      |   8 -
 7 files changed, 301 insertions(+), 103 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 1e180f850a4..2201b05674a 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -1,31 +1,74 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// BabelStream-style bandwidth benchmarks on cub_tile::DeviceTransform.
-// Mirror of cub/benchmarks/bench/transform/babelstream.cu so we can compare
-// numbers side-by-side.
+// BabelStream-style bandwidth benchmarks via cub::DeviceTransform::Transform.
+// Custom ops self-register their tile substitutes via tile_eligible<>, so the
+// dispatch hook routes them to the tile kernel under --enable-tile + the
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro.
 
 #include <nvbench/nvbench.cuh>
 
-#include <cub/device/dispatch/dispatch_transform_tile.cuh>
+#include <cub/device/device_transform.cuh>
 
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
 #include <vector>
 #include <cstdint>
 
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#  include <cuda_tile.h>
+#endif
+
 #include "bench_init.cuh"
 
-#ifndef TILE_SIZE
-#define TILE_SIZE 0     // 0 = auto-pick via detail::pick_tile_size
-#endif
-#define STR_(x) #x
-#define STR(x) STR_(x)
+// User-defined scalar ops (used at the call site, in both build modes).
+struct mul_op {
+    template <class B>
+    __host__ __device__ auto operator()(B b) const { return -(b + b); }
+};
+struct add_op {
+    template <class A, class B>
+    __host__ __device__ auto operator()(A a, B b) const { return a + b; }
+};
+struct triad_op {
+    template <class B, class C>
+    __host__ __device__ auto operator()(B b, C c) const { return b - c - c; }
+};
+struct nstream_op {
+    template <class A, class B, class C>
+    __host__ __device__ auto operator()(A a, B b, C c) const { return a + b - c - c; }
+};
 
-struct mul_op     { template <class B>                  __tile__ auto operator()(B b) const         { return -(b + b); } };
-struct add_op     { template <class A, class B>         __tile__ auto operator()(A a, B b) const    { return a + b; } };
-struct triad_op   { template <class B, class C>         __tile__ auto operator()(B b, C c) const    { return b - c - c; } };
-struct nstream_op { template <class A, class B, class C> __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; } };
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+// Tile-friendly substitutes (must be stateless + trivially default constructible).
+struct tile_mul_op {
+    template <class B>
+    __tile__ auto operator()(B b) const { return -(b + b); }
+};
+struct tile_add_op {
+    template <class A, class B>
+    __tile__ auto operator()(A a, B b) const { return a + b; }
+};
+struct tile_triad_op {
+    template <class B, class C>
+    __tile__ auto operator()(B b, C c) const { return b - c - c; }
+};
+struct tile_nstream_op {
+    template <class A, class B, class C>
+    __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; }
+};
+
+// Self-register each scalar op for all T (partial specialization on T).
+CUB_NAMESPACE_BEGIN
+namespace detail::transform::tile
+{
+template <class T> struct tile_eligible<mul_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_mul_op; };
+template <class T> struct tile_eligible<add_op,     T, 2> : ::cuda::std::true_type { using tile_op_type = tile_add_op; };
+template <class T> struct tile_eligible<triad_op,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_triad_op; };
+template <class T> struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type { using tile_op_type = tile_nstream_op; };
+} // namespace detail::transform::tile
+CUB_NAMESPACE_END
+#endif
 
 // True if `bytes_needed` worth of GPU memory is available, with 5% headroom
 // for driver overhead. Caller should `state.skip(...)` on false.
@@ -43,8 +86,6 @@ struct Buffers {
         cudaMalloc(&a, n * sizeof(T));
         cudaMalloc(&b, n * sizeof(T));
         cudaMalloc(&c, n * sizeof(T));
-        // touch every page so HBM is actually backed (not cold-page tricks).
-        // values don't matter for BW measurement.
         bench_init::rand_fill(a, n, 0xA111);
         bench_init::rand_fill(b, n, 0xB222);
         bench_init::rand_fill(c, n, 0xC333);
@@ -62,7 +103,7 @@ void mul(nvbench::state& state, nvbench::type_list<T>) {
     state.add_global_memory_reads<T>(n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform<TILE_SIZE>(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(buf.b), buf.c, n, mul_op{}, launch.get_stream());
     });
 }
@@ -75,7 +116,7 @@ void add(nvbench::state& state, nvbench::type_list<T>) {
     state.add_global_memory_reads<T>(2 * n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform<TILE_SIZE>(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(buf.a, buf.b), buf.c, n, add_op{}, launch.get_stream());
     });
 }
@@ -88,7 +129,7 @@ void triad(nvbench::state& state, nvbench::type_list<T>) {
     state.add_global_memory_reads<T>(2 * n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform<TILE_SIZE>(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(buf.b, buf.c), buf.a, n, triad_op{}, launch.get_stream());
     });
 }
@@ -101,7 +142,7 @@ void nstream(nvbench::state& state, nvbench::type_list<T>) {
     state.add_global_memory_reads<T>(3 * n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform<TILE_SIZE>(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(buf.a, buf.b, buf.c), buf.a, n, nstream_op{}, launch.get_stream());
     });
 }
@@ -109,9 +150,9 @@ void nstream(nvbench::state& state, nvbench::type_list<T>) {
 using types = nvbench::type_list<std::int8_t, std::int16_t, float, double>;
 inline auto sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};
 
-NVBENCH_BENCH_TYPES(mul,     NVBENCH_TYPE_AXES(types)).set_name("tile_mul_ts"     STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
-NVBENCH_BENCH_TYPES(add,     NVBENCH_TYPE_AXES(types)).set_name("tile_add_ts"     STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
-NVBENCH_BENCH_TYPES(triad,   NVBENCH_TYPE_AXES(types)).set_name("tile_triad_ts"   STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
-NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream_ts" STR(TILE_SIZE)).add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(mul,     NVBENCH_TYPE_AXES(types)).set_name("tile_mul").add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(add,     NVBENCH_TYPE_AXES(types)).set_name("tile_add").add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(triad,   NVBENCH_TYPE_AXES(types)).set_name("tile_triad").add_int64_power_of_two_axis("Elements{io}", sizes);
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream").add_int64_power_of_two_axis("Elements{io}", sizes);
 
 NVBENCH_MAIN
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index 951af8b0fed..07d08f74b8b 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -1,23 +1,43 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Pure copy bench (identity transform) — tile side.
-// Isolates the load/store path from any arithmetic on top: useful for
-// catching narrow-type store wars (e.g. byte stores capping BW).
+// Pure copy bench (identity transform). Custom identity op self-registers
+// its tile substitute via tile_eligible<>; under --enable-tile + the
+// dispatch macro this routes to the tile load_masked/store_masked path,
+// otherwise it falls through to CUB's standard transform.
 
 #include <nvbench/nvbench.cuh>
-#include <cub/device/dispatch/dispatch_transform_tile.cuh>
+
+#include <cub/device/device_transform.cuh>
+
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
 #include <vector>
 #include <cstdint>
 
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#  include <cuda_tile.h>
+#endif
+
 #include "bench_init.cuh"
 
 struct identity {
+    template <class T> __host__ __device__ auto operator()(T v) const { return v; }
+};
+
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+struct tile_identity {
     template <class T> __tile__ auto operator()(T v) const { return v; }
 };
 
+CUB_NAMESPACE_BEGIN
+namespace detail::transform::tile
+{
+template <class T> struct tile_eligible<identity, T, 1> : ::cuda::std::true_type { using tile_op_type = tile_identity; };
+} // namespace detail::transform::tile
+CUB_NAMESPACE_END
+#endif
+
 template <typename T>
 void copy(nvbench::state& state, nvbench::type_list<T>) {
     auto n = state.get_int64("Elements{io}");
@@ -28,7 +48,7 @@ void copy(nvbench::state& state, nvbench::type_list<T>) {
     state.add_global_memory_reads<T>(n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(in), out, n, identity{}, launch.get_stream());
     });
     cudaFree(in); cudaFree(out);
diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu
index 5514c1a1287..5105b25b67b 100644
--- a/cub/benchmarks/bench/transform/tile/fill.cu
+++ b/cub/benchmarks/bench/transform/tile/fill.cu
@@ -1,12 +1,16 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Fill: zero-input broadcast.  CUB models this as Transform with empty input tuple
-// and a no-arg op.  Tile can't express zero-input Transform directly, so we use the
-// dedicated cub_tile::DeviceTransform::Fill API which writes a constant.
+// Fill: zero-input broadcast. Calls cub::DeviceTransform::Fill, which goes
+// through the unified __transform_internal path. The tile dispatch hook
+// has no trait specialization for the zero-input case, so this always
+// lands on CUB's standard Fill kernel. Wire a tile substitute later if Fill
+// becomes a bottleneck.
 
 #include <nvbench/nvbench.cuh>
-#include <cub/device/dispatch/dispatch_transform_tile.cuh>
+
+#include <cub/device/device_transform.cuh>
+
 #include <cuda_runtime.h>
 
 template <typename T>
@@ -16,7 +20,7 @@ void fill(nvbench::state& state, nvbench::type_list<T>) {
     state.add_element_count(n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Fill(out, n, T(42), launch.get_stream());
+        cub::DeviceTransform::Fill(out, n, T(42), launch.get_stream());
     });
     cudaFree(out);
 }
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index 14641c2d872..e715945b9bb 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -1,21 +1,35 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Grayscale: RGB pixel -> luminance.  Uses a 3-component pixel type.
-// CUB stores rgb_t<float> (12 bytes) packed; tile may or may not accept this as an
-// element type.  If tile rejects rgb_t<float>, this bench will fail to compile —
-// we'll then fall back to treating R/G/B as three separate float streams.
+// Grayscale: RGB pixel -> luminance via three separate input streams.
+// Custom rgb_to_y op self-registers its tile substitute via tile_eligible<>.
 
 #include <nvbench/nvbench.cuh>
-#include <cub/device/dispatch/dispatch_transform_tile.cuh>
-#include "bench_init.cuh"
+
+#include <cub/device/device_transform.cuh>
+
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
 #include <vector>
 
-// Three-stream version (R, G, B as separate input arrays).
-// Computationally equivalent to CUB's packed rgb_t version.
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#  include <cuda_tile.h>
+#endif
+
+#include "bench_init.cuh"
+
 struct rgb_to_y {
+    template <class R, class G, class B>
+    __host__ __device__ auto operator()(R r, G g, B b) const {
+        constexpr float w_r = 0.2989f;
+        constexpr float w_g = 0.587f;
+        constexpr float w_b = 0.114f;
+        return w_r * r + w_g * g + w_b * b;
+    }
+};
+
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+struct tile_rgb_to_y {
     template <class R, class G, class B>
     __tile__ auto operator()(R r, G g, B b) const {
         constexpr float w_r = 0.2989f;
@@ -25,6 +39,14 @@ struct rgb_to_y {
     }
 };
 
+CUB_NAMESPACE_BEGIN
+namespace detail::transform::tile
+{
+template <class T> struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type { using tile_op_type = tile_rgb_to_y; };
+} // namespace detail::transform::tile
+CUB_NAMESPACE_END
+#endif
+
 template <typename T>
 void grayscale(nvbench::state& state, nvbench::type_list<T>) {
     const auto n = state.get_int64("Elements{io}");
@@ -39,7 +61,7 @@ void grayscale(nvbench::state& state, nvbench::type_list<T>) {
     state.add_global_memory_reads<T>(3 * n);   // matches CUB's rgb_t<T> = 3*sizeof(T)
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(r, g, b), out, n, rgb_to_y{}, launch.get_stream());
     });
     cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out);
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index e1eee3e4452..71cbd20f583 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -1,51 +1,128 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// PyTorch ops on tile.  Uses ct::tanh / ct::sin / ct::exp / ct::select.
+// PyTorch-style ops via cub::DeviceTransform::Transform. Each custom op
+// self-registers a tile substitute through tile_eligible<>, so the dispatch
+// hook routes them to the tile kernel under --enable-tile + the
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro. MUFU-heavy ops also opt into
+// tile_mufu_heavy<> so the tile policy picker caps items/thread at the
+// vector width on sub-4-byte types.
 
 #include <nvbench/nvbench.cuh>
-#include <cub/device/dispatch/dispatch_transform_tile.cuh>
+
+#include <cub/device/device_transform.cuh>
+
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
+#include <cuda/std/cmath>
 #include <cuda/std/tuple>
 #include <vector>
 
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#  include <cuda_tile.h>
+#endif
+
 #include "bench_init.cuh"
 
-namespace ct = cuda::tiles;
+// ========================================================================
+// Scalar ops (the types the user passes to cub::DeviceTransform::Transform).
+// Unary ops convert to float, compute, and cast back to T for every T (not
+// only sub-4-byte types), like the tile substitutes; binary ops use T as-is.
+// ========================================================================
+template <class T> __host__ __device__ float to_f(T v) { return static_cast<float>(v); }
+template <class T> __host__ __device__ T from_f(float f) { return static_cast<T>(f); }
+
+struct relu_op    { template <class T> __host__ __device__ T operator()(T v) const {
+    float f = to_f(v); return from_f<T>(f > 0.0f ? f : 0.0f); } };
+struct sigmoid_op { template <class T> __host__ __device__ T operator()(T v) const {
+    float f = to_f(v); return from_f<T>(1.0f / (1.0f + ::cuda::std::exp(-f))); } };
+struct tanh_op    { template <class T> __host__ __device__ T operator()(T v) const {
+    return from_f<T>(::cuda::std::tanh(to_f(v))); } };
+struct gelu_op    { template <class T> __host__ __device__ T operator()(T v) const {
+    constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;
+    float f = to_f(v);
+    return from_f<T>(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f)))); } };
+struct sin_op     { template <class T> __host__ __device__ T operator()(T v) const {
+    return from_f<T>(::cuda::std::sin(to_f(v))); } };
+struct exp_op     { template <class T> __host__ __device__ T operator()(T v) const {
+    return from_f<T>(::cuda::std::exp(to_f(v))); } };
+
+struct binary_add  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a + b; } };
+struct binary_sub  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a - b; } };
+struct binary_mul  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a * b; } };
+struct binary_div  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a / b; } };
+struct binary_le   { template <class A, class B> __host__ __device__ A operator()(A a, B b) const { return static_cast<A>(a <= b); } };
+struct binary_ge   { template <class A, class B> __host__ __device__ A operator()(A a, B b) const { return static_cast<A>(a >= b); } };
+struct binary_fmin { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a < b ? a : b; } };
+struct binary_fmax { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a > b ? a : b; } };
+
+// ========================================================================
+// Tile substitutes + trait registration. Only compiled under tile mode.
+// ========================================================================
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+namespace ct = ::cuda::tiles;
 
-// --- Unary --- (compute in float, cast back so the same ops work for __half/__bf16/float)
 template <class T> __tile__ auto as_float(T v) { return ct::element_cast<float>(v); }
 template <class T, class F> __tile__ auto from_float(F f) { return ct::element_cast<ct::tile_element_t<T>>(f); }
 
-struct relu_op    { template <class T> __tile__ auto operator()(T v) const {
+struct tile_relu    { template <class T> __tile__ auto operator()(T v) const {
     auto f = as_float(v); return from_float<T>(ct::select(f > 0.0f, f, f - f)); } };
-struct sigmoid_op { template <class T> __tile__ auto operator()(T v) const {
+struct tile_sigmoid { template <class T> __tile__ auto operator()(T v) const {
     auto f = as_float(v); return from_float<T>(1.0f / (1.0f + ct::exp(-f))); } };
-struct tanh_op    { template <class T> __tile__ auto operator()(T v) const {
+struct tile_tanh    { template <class T> __tile__ auto operator()(T v) const {
     return from_float<T>(ct::tanh(as_float(v))); } };
-struct gelu_op    { template <class T> __tile__ auto operator()(T v) const {
+struct tile_gelu    { template <class T> __tile__ auto operator()(T v) const {
     constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;
     auto f = as_float(v);
     return from_float<T>(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); } };
-struct sin_op { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::sin(as_float(v))); } };
-struct exp_op { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::exp(as_float(v))); } };
-
-// --- Binary ---
-struct binary_add  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a + b; } };
-struct binary_sub  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a - b; } };
-struct binary_mul  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a * b; } };
-struct binary_div  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a / b; } };
-// le/ge: cast the bool result tile to A's element type so it fits the float output buffer
-//        (CUB does the same implicit cast via its iterator path).
-struct binary_le   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a <= b); } };
-struct binary_ge   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a >= b); } };
-struct binary_fmin { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } };
-struct binary_fmax { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } };
-
-
-template <typename Op, typename T, bool MufuHeavy = false>
+struct tile_sin     { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::sin(as_float(v))); } };
+struct tile_exp     { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::exp(as_float(v))); } };
+
+struct tile_binary_add  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a + b; } };
+struct tile_binary_sub  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a - b; } };
+struct tile_binary_mul  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a * b; } };
+struct tile_binary_div  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a / b; } };
+struct tile_binary_le   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a <= b); } };
+struct tile_binary_ge   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a >= b); } };
+struct tile_binary_fmin { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } };
+struct tile_binary_fmax { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } };
+
+CUB_NAMESPACE_BEGIN
+namespace detail::transform::tile
+{
+// Unary
+template <class T> struct tile_eligible<relu_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_relu;    };
+template <class T> struct tile_eligible<sigmoid_op, T, 1> : ::cuda::std::true_type { using tile_op_type = tile_sigmoid; };
+template <class T> struct tile_eligible<tanh_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_tanh;    };
+template <class T> struct tile_eligible<gelu_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_gelu;    };
+template <class T> struct tile_eligible<sin_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_sin;     };
+template <class T> struct tile_eligible<exp_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_exp;     };
+
+// MUFU-heavy unary ops: hint to tile policy picker to cap items/thread at vector width on sub-4-byte types.
+template <> struct tile_mufu_heavy<sigmoid_op> : ::cuda::std::true_type {};
+template <> struct tile_mufu_heavy<tanh_op>    : ::cuda::std::true_type {};
+template <> struct tile_mufu_heavy<gelu_op>    : ::cuda::std::true_type {};
+template <> struct tile_mufu_heavy<sin_op>     : ::cuda::std::true_type {};
+template <> struct tile_mufu_heavy<exp_op>     : ::cuda::std::true_type {};
+
+// Binary
+template <class T> struct tile_eligible<binary_add,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_add;  };
+template <class T> struct tile_eligible<binary_sub,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_sub;  };
+template <class T> struct tile_eligible<binary_mul,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_mul;  };
+template <class T> struct tile_eligible<binary_div,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_div;  };
+template <class T> struct tile_eligible<binary_le,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_le;   };
+template <class T> struct tile_eligible<binary_ge,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_ge;   };
+template <class T> struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_fmin; };
+template <class T> struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_fmax; };
+} // namespace detail::transform::tile
+CUB_NAMESPACE_END
+#endif
+
+// ========================================================================
+// Bench harness.
+// ========================================================================
+template <typename Op, typename T>
 void run_unary(nvbench::state& state) {
     const auto n = state.get_int64("Elements{io}");
     T *in, *out;
@@ -55,7 +132,7 @@ void run_unary(nvbench::state& state) {
     state.add_global_memory_reads<T>(n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform<0, MufuHeavy>(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(in), out, n, Op{}, launch.get_stream());
     });
     cudaFree(in); cudaFree(out);
@@ -73,7 +150,7 @@ void run_binary(nvbench::state& state) {
     state.add_global_memory_reads<T>(2*n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
-        cub_tile::DeviceTransform::Transform(
+        cub::DeviceTransform::Transform(
             ::cuda::std::make_tuple(a, b), out, n, Op{}, launch.get_stream());
     });
     cudaFree(a); cudaFree(b); cudaFree(out);
@@ -82,18 +159,16 @@ void run_binary(nvbench::state& state) {
 using element_types = nvbench::type_list<__half, __nv_bfloat16, float>;
 inline auto pt_sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};
 
-#define UNARY_BENCH(name, op, mufu) \
-    template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_unary<op, T, mufu>(state); } \
+#define UNARY_BENCH(name, op) \
+    template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_unary<op, T>(state); } \
     NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes);
 
-// MufuHeavy hint set for ops dominated by MUFU intrinsics (exp/tanh/sin/cos).
-// relu is just compare+select, so no hint.
-UNARY_BENCH(relu,    relu_op,    false)
-UNARY_BENCH(sigmoid, sigmoid_op, true)
-UNARY_BENCH(tanh,    tanh_op,    true)
-UNARY_BENCH(gelu,    gelu_op,    true)
-UNARY_BENCH(sin,     sin_op,     true)
-UNARY_BENCH(exp,     exp_op,     true)
+UNARY_BENCH(relu,    relu_op)
+UNARY_BENCH(sigmoid, sigmoid_op)
+UNARY_BENCH(tanh,    tanh_op)
+UNARY_BENCH(gelu,    gelu_op)
+UNARY_BENCH(sin,     sin_op)
+UNARY_BENCH(exp,     exp_op)
 
 #define BINARY_BENCH(name, op) \
     template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_binary<op, T>(state); } \
diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
index 0df21dc66a3..713a3846025 100644
--- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu
+++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
@@ -1,13 +1,22 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Standalone correctness tests for cub_tile::DeviceTransform.
-// Sits next to the benches so it builds against the same tileiras
-// toolchain and does not pretend to be part of CCCL's catch2 suite.
-
-#include <cub/device/dispatch/dispatch_transform_tile.cuh>
+// Standalone correctness tests for cub::DeviceTransform with the tile
+// dispatch hook on. Exercises:
+//   - Built-in trait specs (cuda::std::plus, cuda::std::multiplies)
+//   - User-registered trait specs (square_op, identity_op)
+//   - cub::DeviceTransform::Fill (zero-input case)
+//
+// Built under --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH so the
+// hook routes eligible combos to the tile kernel. Sits next to the benches
+// so it builds against the same tileiras toolchain; not part of CCCL's
+// catch2 suite.
+
+#include <cub/device/device_transform.cuh>
 
 #include <cuda_runtime.h>
+
+#include <cuda/std/functional>
 #include <cuda/std/tuple>
 
 #include <cstdio>
@@ -16,6 +25,10 @@
 #include <cmath>
 #include <vector>
 
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#  include <cuda_tile.h>
+#endif
+
 namespace {
 
 int g_failures = 0;
@@ -59,10 +72,28 @@ void expect_array(const char* name, const std::vector<T>& got, const std::vector
     else            { std::printf("[ OK ] %s (n=%zu)\n", name, got.size()); }
 }
 
-struct identity_op { template <class A> __tile__ auto operator()(A a) const { return a; } };
-struct square_op   { template <class A> __tile__ auto operator()(A a) const { return a * a; } };
-struct add_op      { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a + b; } };
-struct mul_op      { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a * b; } };
+// User-defined scalar functors (the call-site type). identity_op and square_op
+// don't have a cuda::std equivalent, so we self-register them. add and mul map
+// to cuda::std::plus / cuda::std::multiplies which CCCL already ships specs for.
+
+struct identity_op {
+    template <class T> __host__ __device__ T operator()(T a) const { return a; }
+};
+struct square_op {
+    template <class T> __host__ __device__ T operator()(T a) const { return a * a; }
+};
+
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+namespace ct = ::cuda::tiles;
+
+// Tile-friendly substitutes (must be stateless + trivially default constructible).
+struct tile_identity_op {
+    template <class T> __tile__ auto operator()(T v) const { return v; }
+};
+struct tile_square_op {
+    template <class T> __tile__ auto operator()(T v) const { return v * v; }
+};
+#endif
 
 template <typename T>
 std::vector<T> ramp(int64_t n, T start = T{0}, T step = T{1}) {
@@ -91,7 +122,7 @@ template <typename T>
 void test_identity(int64_t n) {
     auto h_in = ramp<T>(n, T{1}, T{1});
     GpuVec<T> dx(h_in), dy(n);
-    CUDA_CHECK(cub_tile::DeviceTransform::Transform(
+    CUDA_CHECK(cub::DeviceTransform::Transform(
         ::cuda::std::make_tuple(dx.d), dy.d, n, identity_op{}));
     CUDA_CHECK(cudaDeviceSynchronize());
     expect_array("identity", dy.to_host(), h_in);
@@ -103,7 +134,7 @@ void test_square(int64_t n) {
     std::vector<T> want(n);
     for (int64_t i = 0; i < n; ++i) want[i] = h_in[i] * h_in[i];
     GpuVec<T> dx(h_in), dy(n);
-    CUDA_CHECK(cub_tile::DeviceTransform::Transform(
+    CUDA_CHECK(cub::DeviceTransform::Transform(
         ::cuda::std::make_tuple(dx.d), dy.d, n, square_op{}));
     CUDA_CHECK(cudaDeviceSynchronize());
     expect_array("square", dy.to_host(), want);
@@ -116,8 +147,8 @@ void test_add(int64_t n) {
     std::vector<T> want(n);
     for (int64_t i = 0; i < n; ++i) want[i] = ha[i] + hb[i];
     GpuVec<T> da(ha), db(hb), dc(n);
-    CUDA_CHECK(cub_tile::DeviceTransform::Transform(
-        ::cuda::std::make_tuple(da.d, db.d), dc.d, n, add_op{}));
+    CUDA_CHECK(cub::DeviceTransform::Transform(
+        ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::plus<T>{}));
     CUDA_CHECK(cudaDeviceSynchronize());
     expect_array("add", dc.to_host(), want);
 }
@@ -129,8 +160,8 @@ void test_mul(int64_t n) {
     std::vector<T> want(n);
     for (int64_t i = 0; i < n; ++i) want[i] = ha[i] * hb[i];
     GpuVec<T> da(ha), db(hb), dc(n);
-    CUDA_CHECK(cub_tile::DeviceTransform::Transform(
-        ::cuda::std::make_tuple(da.d, db.d), dc.d, n, mul_op{}));
+    CUDA_CHECK(cub::DeviceTransform::Transform(
+        ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::multiplies<T>{}));
     CUDA_CHECK(cudaDeviceSynchronize());
     expect_array("mul", dc.to_host(), want);
 }
@@ -138,7 +169,7 @@ void test_mul(int64_t n) {
 template <typename T>
 void test_fill(int64_t n, T value) {
     GpuVec<T> dy(n);
-    CUDA_CHECK(cub_tile::DeviceTransform::Fill(dy.d, n, value));
+    CUDA_CHECK(cub::DeviceTransform::Fill(dy.d, n, value));
     CUDA_CHECK(cudaDeviceSynchronize());
     std::vector<T> want(n, value);
     expect_array("fill", dy.to_host(), want);
@@ -146,6 +177,19 @@ void test_fill(int64_t n, T value) {
 
 } // namespace
 
+// User self-registers identity_op and square_op as tile-eligible.
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+CUB_NAMESPACE_BEGIN
+namespace detail::transform::tile
+{
+template <> struct tile_eligible<identity_op, int32_t, 1> : ::cuda::std::true_type { using tile_op_type = tile_identity_op; };
+template <> struct tile_eligible<identity_op, float, 1>   : ::cuda::std::true_type { using tile_op_type = tile_identity_op; };
+template <> struct tile_eligible<square_op, int32_t, 1>   : ::cuda::std::true_type { using tile_op_type = tile_square_op; };
+template <> struct tile_eligible<square_op, float, 1>     : ::cuda::std::true_type { using tile_op_type = tile_square_op; };
+} // namespace detail::transform::tile
+CUB_NAMESPACE_END
+#endif
+
 int main() {
     // pow-2, multiple tiles
     test_identity<std::int32_t>(4096);
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 414f64a3075..b62cfd61ca8 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -171,12 +171,4 @@ CUB_RUNTIME_FUNCTION cudaError_t dispatch(
 
 CUB_NAMESPACE_END
 
-// Compatibility shim. Existing benches and tests still call
-// cub_tile::DeviceTransform; once they move to cub::DeviceTransform with named
-// functors and the trait dispatch, this alias can be removed.
-namespace cub_tile
-{
-using DeviceTransform = ::cub::detail::transform::tile::DeviceTransform;
-} // namespace cub_tile
-
 #endif // _CCCL_CTK_AT_LEAST(13, 3)

From 4afa1b3070a3cd3020db6780937d513c6a229fba Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 4 Jun 2026 19:40:08 -0700
Subject: [PATCH 18/83] use int64 extents in tile kernels and clean up runtime
 precondition check

---
 .../dispatch/dispatch_transform_tile.cuh      | 19 +++++++++++--------
 .../kernels/kernel_transform_tile.cuh         |  9 ++++++---
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index b62cfd61ca8..438a237c35e 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -27,7 +27,7 @@
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
-#  include <cuda/__memory/is_aligned.h>
+#  include <cuda/std/__memory/is_sufficiently_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
 #  include <cuda/std/__type_traits/is_empty.h>
 #  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
@@ -120,20 +120,23 @@ template <typename OutIter, typename... InIters, typename OffsetT>
 CUB_RUNTIME_FUNCTION bool
 runtime_preconditions_ok(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
 {
-  constexpr int kAlign = 16;
-  // Tile DSL's tensor_span uses uint32_t shape internally; values >= 2^32
-  // wrap to 0. Cap at 2^31 to stay below the cliff with margin.
-  constexpr OffsetT kMaxItems = OffsetT{1} << 31;
+  // Pointer alignment is in bytes (for LDG.E.128); the kernel's
+  // ct::assume_divisible<N> applies to num_items as an element count. These
+  // are both 16 today by coincidence but live on different axes.
+  constexpr int byte_align    = 16;
+  constexpr int items_divisor = 16;
 
   auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
-  const bool aligned_out = ::cuda::is_aligned(out_ptr, kAlign);
+  const bool aligned_out = ::cuda::std::is_sufficiently_aligned<byte_align>(out_ptr);
   const bool aligned_in  = ::cuda::std::apply(
     [](auto... iters) {
-      return ((::cuda::is_aligned(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters), kAlign)) && ...);
+      return ((::cuda::std::is_sufficiently_aligned<byte_align>(
+                THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)))
+              && ...);
     },
     inputs);
 
-  return aligned_out && aligned_in && (num_items % kAlign) == 0 && num_items <= kMaxItems;
+  return aligned_out && aligned_in && (num_items % items_divisor) == 0;
 }
 
 // Bridge between cub::DeviceTransform::__transform_internal and the tile
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 3d038c9068f..5a67e75a04c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -41,12 +41,14 @@ transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restri
   auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
   auto out       = ct::assume_aligned<16>(out_);
 
-  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+  // Explicit int64_t element type on the extent; CTAD would deduce uint32_t
+  // and wrap at 2^32. Using int64_t lets us drop the 2^31 runtime cap.
+  auto out_span = ct::tensor_span{out, ct::extents<int64_t, ct::dynamic_extent>{num_items}};
   auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
 
   auto load_one = [bx, num_items](auto* ptr_) {
     auto ptr  = ct::assume_aligned<16>(ptr_);
-    auto span = ct::tensor_span{ptr, ct::extents{num_items}};
+    auto span = ct::tensor_span{ptr, ct::extents<int64_t, ct::dynamic_extent>{num_items}};
     auto view = ct::partition_view{span, ct::shape<TileSize>{}};
     return view.load_masked(bx);
   };
@@ -63,7 +65,8 @@ __tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T val
   auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
   auto out       = ct::assume_aligned<16>(out_);
 
-  auto out_span = ct::tensor_span{out, ct::extents{num_items}};
+  // Explicit int64_t element type on the extent (see transform_kernel above).
+  auto out_span = ct::tensor_span{out, ct::extents<int64_t, ct::dynamic_extent>{num_items}};
   auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
   using tile_t  = ct::tile<T, ct::shape<TileSize>>;
   out_view.store_masked(ct::full<tile_t>(value), bx);

From f761178b3ce8bf9945835dc6a1167cd465779d56 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 12:49:39 -0700
Subject: [PATCH 19/83] lift tile_eligible and tile_mufu_heavy to
 cub::transform namespace

---
 .../bench/transform/tile/babelstream.cu       |  4 +-
 cub/benchmarks/bench/transform/tile/copy.cu   |  4 +-
 .../bench/transform/tile/grayscale.cu         |  4 +-
 .../bench/transform/tile/pytorch.cu           |  4 +-
 .../transform/tile/test_device_transform.cu   |  4 +-
 .../dispatch/dispatch_transform_tile.cuh      |  8 +-
 .../dispatch_transform_tile_traits.cuh        | 91 +++++++++++--------
 7 files changed, 70 insertions(+), 49 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 2201b05674a..ba1c37036b0 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -60,13 +60,13 @@ struct tile_nstream_op {
 
 // Self-register each scalar op for all T (partial specialization on T).
 CUB_NAMESPACE_BEGIN
-namespace detail::transform::tile
+namespace transform
 {
 template <class T> struct tile_eligible<mul_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_mul_op; };
 template <class T> struct tile_eligible<add_op,     T, 2> : ::cuda::std::true_type { using tile_op_type = tile_add_op; };
 template <class T> struct tile_eligible<triad_op,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_triad_op; };
 template <class T> struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type { using tile_op_type = tile_nstream_op; };
-} // namespace detail::transform::tile
+} // namespace transform
 CUB_NAMESPACE_END
 #endif
 
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index 07d08f74b8b..fd697256dd9 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -31,10 +31,10 @@ struct tile_identity {
 };
 
 CUB_NAMESPACE_BEGIN
-namespace detail::transform::tile
+namespace transform
 {
 template <class T> struct tile_eligible<identity, T, 1> : ::cuda::std::true_type { using tile_op_type = tile_identity; };
-} // namespace detail::transform::tile
+} // namespace transform
 CUB_NAMESPACE_END
 #endif
 
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index e715945b9bb..80768581aab 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -40,10 +40,10 @@ struct tile_rgb_to_y {
 };
 
 CUB_NAMESPACE_BEGIN
-namespace detail::transform::tile
+namespace transform
 {
 template <class T> struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type { using tile_op_type = tile_rgb_to_y; };
-} // namespace detail::transform::tile
+} // namespace transform
 CUB_NAMESPACE_END
 #endif
 
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 71cbd20f583..6e35560f426 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -89,7 +89,7 @@ struct tile_binary_fmin { template <class A, class B> __tile__ auto operator()(A
 struct tile_binary_fmax { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } };
 
 CUB_NAMESPACE_BEGIN
-namespace detail::transform::tile
+namespace transform
 {
 // Unary
 template <class T> struct tile_eligible<relu_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_relu;    };
@@ -115,7 +115,7 @@ template <class T> struct tile_eligible<binary_le,   T, 2> : ::cuda::std::true_t
 template <class T> struct tile_eligible<binary_ge,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_ge;   };
 template <class T> struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_fmin; };
 template <class T> struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_fmax; };
-} // namespace detail::transform::tile
+} // namespace transform
 CUB_NAMESPACE_END
 #endif
 
diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
index 713a3846025..b3fe263909d 100644
--- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu
+++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
@@ -180,13 +180,13 @@ void test_fill(int64_t n, T value) {
 // User self-registers identity_op and square_op as tile-eligible.
 #if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
 CUB_NAMESPACE_BEGIN
-namespace detail::transform::tile
+namespace transform
 {
 template <> struct tile_eligible<identity_op, int32_t, 1> : ::cuda::std::true_type { using tile_op_type = tile_identity_op; };
 template <> struct tile_eligible<identity_op, float, 1>   : ::cuda::std::true_type { using tile_op_type = tile_identity_op; };
 template <> struct tile_eligible<square_op, int32_t, 1>   : ::cuda::std::true_type { using tile_op_type = tile_square_op; };
 template <> struct tile_eligible<square_op, float, 1>     : ::cuda::std::true_type { using tile_op_type = tile_square_op; };
-} // namespace detail::transform::tile
+} // namespace transform
 CUB_NAMESPACE_END
 #endif
 
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 438a237c35e..ac9cdaf059d 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -109,7 +109,8 @@ template <typename Op, typename OutIter, typename... InIters>
 inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
-  && tile_eligible_v<Op, __detail::__unwrapped_value_t<OutIter>, sizeof...(InIters)>;
+  && CUB_NS_QUALIFIER::transform::tile_eligible_v<
+       Op, __detail::__unwrapped_value_t<OutIter>, sizeof...(InIters)>;
 
 // Runtime predicate consulted by the cub::DeviceTransform tile hook before
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
@@ -160,13 +161,14 @@ CUB_RUNTIME_FUNCTION cudaError_t dispatch(
     },
     inputs);
   using out_value_t = ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t<decltype(out_ptr)>>;
-  using tile_op_t   = typename tile_eligible<TransformOp, out_value_t, sizeof...(InIters)>::tile_op_type;
+  using tile_op_t   =
+    typename CUB_NS_QUALIFIER::transform::tile_eligible<TransformOp, out_value_t, sizeof...(InIters)>::tile_op_type;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_op_type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_op_type must be trivially default constructible");
 
-  return DeviceTransform::template Transform<0, tile_mufu_heavy_v<TransformOp>, tile_op_t>(
+  return DeviceTransform::template Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
     in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
 }
 
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index c823cc46b99..330f34d8754 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -3,20 +3,27 @@
 
 // Compile-time policy for cub::DeviceTransform's tile path.
 //
-// Users call cub::DeviceTransform::Transform with whatever scalar functor they
-// have (e.g. cuda::std::plus<__half>). That functor is NOT directly callable
-// from a tile transform_kernel -- its operator() takes scalars, not ct::tile.
-// So eligible specializations declare a `tile_op_type` member that names a
-// tile-friendly replacement functor (with __tile__ templated operator()) that
-// performs the same operation. The dispatch hook then launches the tile
-// kernel with the replacement, not the user's original.
+// PUBLIC EXTENSION POINTS (cub::transform):
+//   tile_eligible<Op, T, NIn>   -- specialize this to opt a (functor type,
+//                                   element type, input arity) combo into
+//                                   the tile dispatch path.
+//   tile_eligible_v<...>        -- variable-template companion.
+//   tile_mufu_heavy<Op>         -- specialize to flag Op as MUFU-heavy; the
+//                                   tile policy picker uses this hint.
+//   tile_mufu_heavy_v<...>      -- variable-template companion.
 //
-// tile_eligible_v<Op, T, NIn> answers "should DeviceTransform::Transform
-// route to the tile kernel for this (functor, element type, input arity)?".
-// tile_mufu_heavy_v<Op> hints the tile policy picker that Op spends most of
-// its time on MUFU instructions, so the picker caps items/thread at the
-// vector width to avoid piling up MUFU work that cannot SIMD on Blackwell
-// for sub-4-byte types.
+// Users call cub::DeviceTransform::Transform with whatever scalar functor
+// they have (e.g. cuda::std::plus<__half>). That scalar functor is NOT
+// directly callable from a tile transform_kernel -- its operator() takes
+// scalars, not ct::tile. So eligible specializations declare a `tile_op_type`
+// member naming a tile-friendly replacement (a stateless functor with a
+// __tile__ templated operator() that performs the same op on ct::tile args).
+// The dispatch hook launches the tile kernel with the replacement, not the
+// user's original functor instance.
+//
+// INTERNAL (cub::detail::transform::tile):
+//   tile_plus, tile_multiplies   -- shipped tile-friendly substitutes used by
+//                                    the built-in specializations below.
 
 #pragma once
 
@@ -44,6 +51,27 @@
 
 CUB_NAMESPACE_BEGIN
 
+// Public extension surface.
+namespace transform
+{
+
+template <typename Op, typename T, ::cuda::std::size_t NIn>
+struct tile_eligible : ::cuda::std::false_type
+{};
+
+template <typename Op, typename T, ::cuda::std::size_t NIn>
+inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
+
+template <typename Op>
+struct tile_mufu_heavy : ::cuda::std::false_type
+{};
+
+template <typename Op>
+inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
+
+} // namespace transform
+
+// Internal substitutes shipped by CCCL.
 namespace detail::transform::tile
 {
 
@@ -70,50 +98,41 @@ struct tile_multiplies
 };
 #  endif // _CCCL_TILE_COMPILATION()
 
-template <typename Op, typename T, ::cuda::std::size_t NIn>
-struct tile_eligible : ::cuda::std::false_type
-{};
-
-template <typename Op, typename T, ::cuda::std::size_t NIn>
-inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
-
-template <typename Op>
-struct tile_mufu_heavy : ::cuda::std::false_type
-{};
-
-template <typename Op>
-inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
+} // namespace detail::transform::tile
 
+// Built-in trait specializations live in the public namespace alongside the
+// trait, but reference the internal substitute functors.
 #  if _CCCL_TILE_COMPILATION()
+namespace transform
+{
 #    if _CCCL_HAS_NVFP16()
 template <>
-struct tile_eligible<::cuda::std::plus<__half>, __half, 2> : ::cuda::std::true_type
+struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = tile_plus;
+  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus;
 };
 template <>
-struct tile_eligible<::cuda::std::multiplies<__half>, __half, 2> : ::cuda::std::true_type
+struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = tile_multiplies;
+  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies;
 };
 #    endif // _CCCL_HAS_NVFP16()
 
 #    if _CCCL_HAS_NVBF16()
 template <>
-struct tile_eligible<::cuda::std::plus<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type
+struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = tile_plus;
+  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus;
 };
 template <>
-struct tile_eligible<::cuda::std::multiplies<__nv_bfloat16>, __nv_bfloat16, 2> : ::cuda::std::true_type
+struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = tile_multiplies;
+  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies;
 };
 #    endif // _CCCL_HAS_NVBF16()
+} // namespace transform
 #  endif // _CCCL_TILE_COMPILATION()
 
-} // namespace detail::transform::tile
-
 CUB_NAMESPACE_END
 
 #endif // _CCCL_CTK_AT_LEAST(13, 3)

From eb6bd04bbcd6d172b1b9381f7ade2755d5574995 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 12:58:25 -0700
Subject: [PATCH 20/83] purge outdated comments from before runtime fallback
 was added

---
 cub/cub/device/device_transform.cuh           | 10 ++++----
 .../dispatch/dispatch_transform_tile.cuh      | 23 ++++++++++++-------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 1560d9e1a68..bcb84f76fa4 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -105,11 +105,11 @@ struct DeviceTransform
 
 #if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
     // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND
-    // the runtime alignment / divisibility / size preconditions hold, route
-    // to the tile kernel. Otherwise fall through to the standard CUB
-    // dispatch below -- CUB's existing kernels handle the unaligned tail
-    // case via their own internal logic, so misalignment is a graceful
-    // fallback, not an error.
+    // the runtime alignment + divisibility preconditions hold, route to the
+    // tile kernel. Otherwise fall through to the standard CUB dispatch
+    // below -- CUB's existing kernels handle the unaligned tail case via
+    // their own internal logic, so misalignment is a graceful fallback,
+    // not an error.
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
                   && detail::transform::tile::tile_dispatch_eligible_v<
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index ac9cdaf059d..e3bb569b2af 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -1,10 +1,16 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-// Tile port of cub::DeviceTransform. The public surface mirrors
-// cub::DeviceTransform::{Transform, Fill}; the kernels are written against the
-// tile DSL (cuda::tiles). This header requires CTK 13.3 or newer and nvcc
-// invoked with --enable-tile.
+// Internal dispatch helpers for cub::DeviceTransform's tile path:
+//   tile_dispatch_eligible_v  -- compile-time predicate the hook consults
+//   runtime_preconditions_ok  -- runtime alignment + divisibility predicate
+//   dispatch                  -- bridge that launches the tile kernel with
+//                                the trait's substitute functor
+//   DeviceTransform           -- internal tile-local Transform/Fill wrappers
+//                                used by `dispatch`
+// User-facing extension points (tile_eligible / tile_mufu_heavy) live in
+// dispatch_transform_tile_traits.cuh under cub::transform.
+// Requires CTK 13.3 or newer and nvcc invoked with --enable-tile.
 
 #pragma once
 
@@ -102,9 +108,10 @@ using __unwrapped_value_t =
 
 // Combined compile-time predicate used by cub::DeviceTransform's __transform_internal
 // to decide whether to route a given (Op, OutIter, InIters...) to the tile path.
-// The call site lifts this into an `if constexpr` so the standard CUB dispatch
-// is not instantiated when tile takes over (under --enable-tile the standard
-// path fails to compile for many functor/type combinations).
+// The call site lifts this into an `if constexpr`: when this is true the hook
+// tries the tile kernel first and, on runtime alignment / divisibility
+// failure, falls through to the standard CUB dispatch below. When false, the
+// tile branch is discarded and only CUB's standard path is emitted.
 template <typename Op, typename OutIter, typename... InIters>
 inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
@@ -116,7 +123,7 @@ inline constexpr bool tile_dispatch_eligible_v =
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
 // guards the vectorized kernel. The tile kernels use ct::assume_aligned<16>
 // and ct::assume_divisible<16>, so violating these at runtime is UB.
-// Returns false to tell the hook to surface cudaErrorInvalidValue.
+// Returns false to tell the hook to fall back to the standard CUB dispatch.
 template <typename OutIter, typename... InIters, typename OffsetT>
 CUB_RUNTIME_FUNCTION bool
 runtime_preconditions_ok(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)

From f69e0ede371ff775c245c3f22ef5192e13cf6fc7 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 13:12:48 -0700
Subject: [PATCH 21/83] move kernel doc-comment next to the kernel and reflow
 to 100 col

---
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 5a67e75a04c..5f6271e61e1 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -1,11 +1,6 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels
-// assume 16-byte alignment on every pointer and 16-byte divisibility on
-// num_items so the compiler can pick LDG.E.128. Callers in the dispatch
-// header are responsible for honoring those preconditions.
-
 #pragma once
 
 #include <cub/config.cuh>
@@ -29,6 +24,9 @@ CUB_NAMESPACE_BEGIN
 namespace detail::transform::tile
 {
 
+// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on
+// every pointer and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in
+// the dispatch header are responsible for honoring those preconditions.
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
 transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_)

From 207ba0e08a454652d6e72fa74ab5187a6450d04e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 13:24:20 -0700
Subject: [PATCH 22/83] gate tile transform headers on a single config macro

---
 .../bench/transform/tile/babelstream.cu       |  4 +--
 cub/benchmarks/bench/transform/tile/copy.cu   |  4 +--
 .../bench/transform/tile/grayscale.cu         |  4 +--
 .../bench/transform/tile/pytorch.cu           |  4 +--
 .../transform/tile/test_device_transform.cu   |  6 ++--
 cub/cub/device/device_transform.cuh           |  7 ++--
 .../dispatch/dispatch_transform_tile.cuh      |  6 ++--
 .../dispatch_transform_tile_config.cuh        | 34 +++++++++++++++++++
 .../dispatch_transform_tile_traits.cuh        | 24 ++++++-------
 .../kernels/kernel_transform_tile.cuh         |  6 ++--
 .../dispatch/tuning/tuning_transform_tile.cuh |  6 ++--
 11 files changed, 71 insertions(+), 34 deletions(-)
 create mode 100644 cub/cub/device/dispatch/dispatch_transform_tile_config.cuh

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index ba1c37036b0..297ef78379a 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -15,7 +15,7 @@
 #include <vector>
 #include <cstdint>
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
@@ -39,7 +39,7 @@ struct nstream_op {
     __host__ __device__ auto operator()(A a, B b, C c) const { return a + b - c - c; }
 };
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 // Tile-friendly substitutes (must be stateless + trivially default constructible).
 struct tile_mul_op {
     template <class B>
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index fd697256dd9..da9665b2f25 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -15,7 +15,7 @@
 #include <vector>
 #include <cstdint>
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
@@ -25,7 +25,7 @@ struct identity {
     template <class T> __host__ __device__ auto operator()(T v) const { return v; }
 };
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 struct tile_identity {
     template <class T> __tile__ auto operator()(T v) const { return v; }
 };
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index 80768581aab..9f364304266 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -12,7 +12,7 @@
 #include <cuda/std/tuple>
 #include <vector>
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
@@ -28,7 +28,7 @@ struct rgb_to_y {
     }
 };
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 struct tile_rgb_to_y {
     template <class R, class G, class B>
     __tile__ auto operator()(R r, G g, B b) const {
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 6e35560f426..0e1767fdac7 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -19,7 +19,7 @@
 #include <cuda/std/tuple>
 #include <vector>
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
@@ -60,7 +60,7 @@ struct binary_fmax { template <class A, class B> __host__ __device__ auto operat
 // ========================================================================
 // Tile substitutes + trait registration. Only compiled under tile mode.
 // ========================================================================
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 namespace ct = ::cuda::tiles;
 
 template <class T> __tile__ auto as_float(T v) { return ct::element_cast<float>(v); }
diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
index b3fe263909d..d3a143a3deb 100644
--- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu
+++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
@@ -25,7 +25,7 @@
 #include <cmath>
 #include <vector>
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
@@ -83,7 +83,7 @@ struct square_op {
     template <class T> __host__ __device__ T operator()(T a) const { return a * a; }
 };
 
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 namespace ct = ::cuda::tiles;
 
 // Tile-friendly substitutes (must be stateless + trivially default constructible).
@@ -178,7 +178,7 @@ void test_fill(int64_t n, T value) {
 } // namespace
 
 // User self-registers identity_op and square_op as tile-eligible.
-#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index bcb84f76fa4..bcdedf8ba95 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -15,9 +15,10 @@
 
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_transform.cuh>
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
 #include <cub/util_namespace.cuh>
 
-#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cub/device/dispatch/dispatch_transform_tile.cuh>
 #endif
 
@@ -103,7 +104,7 @@ struct DeviceTransform
 
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
-#if _CCCL_CTK_AT_LEAST(13, 3) && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH) && _CCCL_TILE_COMPILATION()
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
     // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND
     // the runtime alignment + divisibility preconditions hold, route to the
     // tile kernel. Otherwise fall through to the standard CUB dispatch
@@ -123,7 +124,7 @@ struct DeviceTransform
           inputs, output, static_cast<offset_t>(num_items), stream);
       }
     }
-#endif // _CCCL_CTK_AT_LEAST(13, 3) && CCCL_ENABLE_TILE_TRANSFORM_DISPATCH && _CCCL_TILE_COMPILATION()
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 
     using tuning_env =
       ::cuda::std::execution::__query_result_or_t<Env, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index e3bb569b2af..f9ec1e1ff31 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -16,6 +16,8 @@
 
 #include <cub/config.cuh>
 
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -24,7 +26,7 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_CTK_AT_LEAST(13, 3)
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
 
 #  include <cub/device/dispatch/dispatch_transform_tile_traits.cuh>
 #  include <cub/device/dispatch/kernels/kernel_transform_tile.cuh>
@@ -183,4 +185,4 @@ CUB_RUNTIME_FUNCTION cudaError_t dispatch(
 
 CUB_NAMESPACE_END
 
-#endif // _CCCL_CTK_AT_LEAST(13, 3)
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
new file mode 100644
index 00000000000..cd43a9d8b48
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -0,0 +1,40 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Single source of truth for the compile-time gates the tile transform headers
+// share. Two macros:
+//
+//   _CCCL_CUB_HAS_TILE_TRANSFORM()
+//     True when CUB's tile transform machinery is available: CTK 13.3 or newer,
+//     C++20 (tile DSL requires it), and the tile compilation trajectory
+//     (--enable-tile). When false, the tile headers (kernel / tuning / dispatch
+//     / traits) are skipped entirely.
+//
+//   _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+//     True when the dispatch hook in cub::DeviceTransform should fire. Same as
+//     _CCCL_CUB_HAS_TILE_TRANSFORM() plus the user opt-in macro
+//     CCCL_ENABLE_TILE_TRANSFORM_DISPATCH.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#define _CCCL_CUB_HAS_TILE_TRANSFORM() \
+  (_CCCL_CTK_AT_LEAST(13, 3) && _CCCL_STD_VER >= 2020 && _CCCL_TILE_COMPILATION())
+
+// The opt-in is tested with a real #if instead of expanding to `defined(...)`: a `defined` token
+// produced by macro replacement inside an #if condition is undefined behavior and triggers
+// -Wexpansion-to-defined. CCCL_ENABLE_TILE_TRANSFORM_DISPATCH must be set before this header.
+#if defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)
+#  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() _CCCL_CUB_HAS_TILE_TRANSFORM()
+#else // ^^^ user opted in ^^^ / vvv user did not opt in vvv
+#  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 0
+#endif // CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index 330f34d8754..32422766ba4 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -29,6 +29,8 @@
 
 #include <cub/config.cuh>
 
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -37,7 +39,9 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_CTK_AT_LEAST(13, 3)
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
+
+#  include <cuda_tile.h>
 
 #  include <cuda/std/__cccl/extended_data_types.h>
 #  include <cuda/std/__functional/operations.h>
@@ -45,10 +49,6 @@
 
 #  include <cstddef>
 
-#  if _CCCL_TILE_COMPILATION()
-#    include <cuda_tile.h>
-#  endif
-
 CUB_NAMESPACE_BEGIN
 
 // Public extension surface.
@@ -75,7 +75,6 @@ inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
 namespace detail::transform::tile
 {
 
-#  if _CCCL_TILE_COMPILATION()
 // Tile-friendly mirrors of common cuda::std ops. Each has a __tile__
 // templated operator() so it can be invoked from inside transform_kernel
 // where the arguments are ct::tile<T, ...> rather than scalar T.
@@ -96,16 +95,14 @@ struct tile_multiplies
     return a * b;
   }
 };
-#  endif // _CCCL_TILE_COMPILATION()
 
 } // namespace detail::transform::tile
 
 // Built-in trait specializations live in the public namespace alongside the
 // trait, but reference the internal substitute functors.
-#  if _CCCL_TILE_COMPILATION()
 namespace transform
 {
-#    if _CCCL_HAS_NVFP16()
+#  if _CCCL_HAS_NVFP16()
 template <>
 struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type
 {
@@ -116,9 +113,9 @@ struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::s
 {
   using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies;
 };
-#    endif // _CCCL_HAS_NVFP16()
+#  endif // _CCCL_HAS_NVFP16()
 
-#    if _CCCL_HAS_NVBF16()
+#  if _CCCL_HAS_NVBF16()
 template <>
 struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
 {
@@ -129,10 +126,9 @@ struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16,
 {
   using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies;
 };
-#    endif // _CCCL_HAS_NVBF16()
+#  endif // _CCCL_HAS_NVBF16()
 } // namespace transform
-#  endif // _CCCL_TILE_COMPILATION()
 
 CUB_NAMESPACE_END
 
-#endif // _CCCL_CTK_AT_LEAST(13, 3)
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 5f6271e61e1..a5c7e2d2d82 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -5,6 +5,8 @@
 
 #include <cub/config.cuh>
 
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -13,7 +15,7 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_CTK_AT_LEAST(13, 3)
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
 
 #  include <cuda_tile.h>
 
@@ -74,4 +76,4 @@ __tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T val
 
 CUB_NAMESPACE_END
 
-#endif // _CCCL_CTK_AT_LEAST(13, 3)
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index 86c2d1b394f..4bd82475c5c 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -11,6 +11,8 @@
 
 #include <cub/config.cuh>
 
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -19,7 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_CTK_AT_LEAST(13, 3)
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
 
 #  include <cuda/cmath>
 
@@ -100,4 +102,4 @@ constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000)
 
 CUB_NAMESPACE_END
 
-#endif // _CCCL_CTK_AT_LEAST(13, 3)
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()

From 48b949a189be94b59fe44efc283fd3fb9f500812 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 13:28:32 -0700
Subject: [PATCH 23/83] tidy kernel_transform_tile.cuh: use cuda::std::int64_t
 and drop _-suffix params

---
 .../kernels/kernel_transform_tile.cuh         | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index a5c7e2d2d82..18bf9cd5f86 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -19,7 +19,7 @@
 
 #  include <cuda_tile.h>
 
-#  include <cstdint>
+#  include <cuda/std/cstdint>
 
 CUB_NAMESPACE_BEGIN
 
@@ -31,42 +31,42 @@ namespace detail::transform::tile
 // the dispatch header are responsible for honoring those preconditions.
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
-transform_kernel(int64_t num_items_, Out* __restrict__ out_, const Ins* __restrict__... ins_)
+transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
-  namespace ct = cuda::tiles;
+  namespace ct = ::cuda::tiles;
 
   const auto bx = ct::bid().x;
   Fn fn{};
 
-  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
-  auto out       = ct::assume_aligned<16>(out_);
+  const auto n         = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+  const auto out_align = ct::assume_aligned<16>(out);
 
-  // Explicit int64_t element type on the extent; CTAD would deduce uint32_t
-  // and wrap at 2^32. Using int64_t lets us drop the 2^31 runtime cap.
-  auto out_span = ct::tensor_span{out, ct::extents<int64_t, ct::dynamic_extent>{num_items}};
+  // Explicit int64_t element type on the extent; CTAD would deduce uint32_t and wrap at 2^32. Using
+  // int64_t lets us drop the 2^31 runtime cap.
+  auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
   auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
 
-  auto load_one = [bx, num_items](auto* ptr_) {
-    auto ptr  = ct::assume_aligned<16>(ptr_);
-    auto span = ct::tensor_span{ptr, ct::extents<int64_t, ct::dynamic_extent>{num_items}};
-    auto view = ct::partition_view{span, ct::shape<TileSize>{}};
+  auto load_one = [bx, n](auto* ptr) {
+    auto ptr_align = ct::assume_aligned<16>(ptr);
+    auto span      = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
+    auto view      = ct::partition_view{span, ct::shape<TileSize>{}};
     return view.load_masked(bx);
   };
 
-  out_view.store_masked(fn(load_one(ins_)...), bx);
+  out_view.store_masked(fn(load_one(ins)...), bx);
 }
 
 template <int TileSize, typename T>
-__tile_global__ void fill_kernel(int64_t num_items_, T* __restrict__ out_, T value)
+__tile_global__ void fill_kernel(::cuda::std::int64_t num_items, T* __restrict__ out, T value)
 {
-  namespace ct  = cuda::tiles;
+  namespace ct  = ::cuda::tiles;
   const auto bx = ct::bid().x;
 
-  auto num_items = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items_));
-  auto out       = ct::assume_aligned<16>(out_);
+  const auto n         = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+  const auto out_align = ct::assume_aligned<16>(out);
 
   // Explicit int64_t element type on the extent (see transform_kernel above).
-  auto out_span = ct::tensor_span{out, ct::extents<int64_t, ct::dynamic_extent>{num_items}};
+  auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
   auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
   using tile_t  = ct::tile<T, ct::shape<TileSize>>;
   out_view.store_masked(ct::full<tile_t>(value), bx);

From bb091d0d33d938999141dc35371dcb0e753168e0 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 14:02:45 -0700
Subject: [PATCH 24/83] factor out make_partition_view helper and document
 assume_* annotations

---
 .../kernels/kernel_transform_tile.cuh         | 47 ++++++++++---------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 18bf9cd5f86..781bd84b6b5 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -26,32 +26,39 @@ CUB_NAMESPACE_BEGIN
 namespace detail::transform::tile
 {
 
+// Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing:
+//   assume_aligned<16>      -- promises the pointer is 16-byte aligned, so the compiler can pick
+//                              LDG.E.128 vectorized loads/stores.
+//   ct::extents<int64_t,..> -- explicit element type on the extent; CTAD would deduce uint32_t and
+//                              wrap at 2^32. int64_t lets us cover the full num_items range.
+// The caller is responsible for honoring assume_aligned<16>; the dispatch header's
+// runtime_preconditions_ok enforces this before launching either kernel.
+template <int TileSize, typename T, typename N>
+__tile__ auto make_partition_view(T* ptr, N n)
+{
+  namespace ct        = ::cuda::tiles;
+  const auto ptr_align = ct::assume_aligned<16>(ptr);
+  auto span            = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
+  return ct::partition_view{span, ct::shape<TileSize>{}};
+}
+
 // Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on
 // every pointer and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in
 // the dispatch header are responsible for honoring those preconditions.
+//
+// assume_divisible<16>      -- promises num_items % 16 == 0, so the tile DSL can elide tail handling.
+// assume_bounded_below<0>   -- promises num_items >= 0; enables sign-comparison simplifications.
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
 transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
-  namespace ct = ::cuda::tiles;
-
+  namespace ct  = ::cuda::tiles;
   const auto bx = ct::bid().x;
   Fn fn{};
 
-  const auto n         = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  const auto out_align = ct::assume_aligned<16>(out);
-
-  // Explicit int64_t element type on the extent; CTAD would deduce uint32_t and wrap at 2^32. Using
-  // int64_t lets us drop the 2^31 runtime cap.
-  auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
-  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
-
-  auto load_one = [bx, n](auto* ptr) {
-    auto ptr_align = ct::assume_aligned<16>(ptr);
-    auto span      = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
-    auto view      = ct::partition_view{span, ct::shape<TileSize>{}};
-    return view.load_masked(bx);
-  };
+  const auto n     = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+  auto out_view    = make_partition_view<TileSize>(out, n);
+  auto load_one    = [bx, n](auto* ptr) { return make_partition_view<TileSize>(ptr, n).load_masked(bx); };
 
   out_view.store_masked(fn(load_one(ins)...), bx);
 }
@@ -62,12 +69,8 @@ __tile_global__ void fill_kernel(::cuda::std::int64_t num_items, T* __restrict__
   namespace ct  = ::cuda::tiles;
   const auto bx = ct::bid().x;
 
-  const auto n         = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  const auto out_align = ct::assume_aligned<16>(out);
-
-  // Explicit int64_t element type on the extent (see transform_kernel above).
-  auto out_span = ct::tensor_span{out_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
-  auto out_view = ct::partition_view{out_span, ct::shape<TileSize>{}};
+  const auto n  = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+  auto out_view = make_partition_view<TileSize>(out, n);
   using tile_t  = ct::tile<T, ct::shape<TileSize>>;
   out_view.store_masked(ct::full<tile_t>(value), bx);
 }

From 514b53617db1a2236152f7f06948f11ad9b1d158 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 14:09:27 -0700
Subject: [PATCH 25/83] const-qualify scalar parameters in transform_kernel and
 fill_kernel

---
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 781bd84b6b5..84298e35e78 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -50,7 +50,7 @@ __tile__ auto make_partition_view(T* ptr, N n)
 // assume_bounded_below<0>   -- promises num_items >= 0; enables sign-comparison simplifications.
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
-transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
+transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
   namespace ct  = ::cuda::tiles;
   const auto bx = ct::bid().x;
@@ -64,7 +64,7 @@ transform_kernel(::cuda::std::int64_t num_items, Out* __restrict__ out, const In
 }
 
 template <int TileSize, typename T>
-__tile_global__ void fill_kernel(::cuda::std::int64_t num_items, T* __restrict__ out, T value)
+__tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
 {
   namespace ct  = ::cuda::tiles;
   const auto bx = ct::bid().x;

From 607f7a07fdbddeca626b12d4cd0c51509c4353cc Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 14:18:43 -0700
Subject: [PATCH 26/83] rename runtime_preconditions_ok to
 runtime_preconditions_valid

---
 cub/cub/device/device_transform.cuh                       | 2 +-
 cub/cub/device/dispatch/dispatch_transform_tile.cuh       | 8 ++++----
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index bcdedf8ba95..9e1404c0f62 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -118,7 +118,7 @@ struct DeviceTransform
                        RandomAccessIteratorOut,
                        RandomAccessIteratorsIn...>)
     {
-      if (detail::transform::tile::runtime_preconditions_ok(inputs, output, static_cast<offset_t>(num_items)))
+      if (detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(num_items)))
       {
         return detail::transform::tile::dispatch<TransformOp>(
           inputs, output, static_cast<offset_t>(num_items), stream);
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index f9ec1e1ff31..0c2dfaf9ba2 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -3,7 +3,7 @@
 
 // Internal dispatch helpers for cub::DeviceTransform's tile path:
 //   tile_dispatch_eligible_v  -- compile-time predicate the hook consults
-//   runtime_preconditions_ok  -- runtime alignment + divisibility predicate
+//   runtime_preconditions_valid  -- runtime alignment + divisibility predicate
 //   dispatch                  -- bridge that launches the tile kernel with
 //                                the trait's substitute functor
 //   DeviceTransform           -- internal tile-local Transform/Fill wrappers
@@ -128,7 +128,7 @@ inline constexpr bool tile_dispatch_eligible_v =
 // Returns false to tell the hook to fall back to the standard CUB dispatch.
 template <typename OutIter, typename... InIters, typename OffsetT>
 CUB_RUNTIME_FUNCTION bool
-runtime_preconditions_ok(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
+runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
 {
   // Pointer alignment is in bytes (for LDG.E.128); the kernel's
   // ct::assume_divisible<N> applies to num_items as an element count. These
@@ -151,10 +151,10 @@ runtime_preconditions_ok(::cuda::std::tuple<InIters...> const& inputs, OutIter o
 
 // Bridge between cub::DeviceTransform::__transform_internal and the tile
 // DeviceTransform above. Precondition: tile_dispatch_eligible_v<Op, OutIter,
-// InIters...> is true AND runtime_preconditions_ok returned true. The kernel
+// InIters...> is true AND runtime_preconditions_valid returned true. The kernel
 // itself assumes 16-byte pointer alignment and num_items divisibility; the
 // caller (the hook in device_transform.cuh) is responsible for checking
-// runtime_preconditions_ok first.
+// runtime_preconditions_valid first.
 //
 // The tile kernel is launched with the trait's tile_op_type (a tile-friendly
 // mirror of Op with __tile__ operator), NOT the user's Op instance -- the
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 84298e35e78..f31cfca5d40 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -32,7 +32,7 @@ namespace detail::transform::tile
 //   ct::extents<int64_t,..> -- explicit element type on the extent; CTAD would deduce uint32_t and
 //                              wrap at 2^32. int64_t lets us cover the full num_items range.
 // The caller is responsible for honoring assume_aligned<16>; the dispatch header's
-// runtime_preconditions_ok enforces this before launching either kernel.
+// runtime_preconditions_valid enforces this before launching either kernel.
 template <int TileSize, typename T, typename N>
 __tile__ auto make_partition_view(T* ptr, N n)
 {

From a5d3eca54eadd82f0949ed63582086115a94d448 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 14:30:58 -0700
Subject: [PATCH 27/83] trim tile traits header includes

---
 cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index 32422766ba4..660ec4f10c6 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -43,12 +43,10 @@
 
 #  include <cuda_tile.h>
 
-#  include <cuda/std/__cccl/extended_data_types.h>
+#  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/__functional/operations.h>
 #  include <cuda/std/__type_traits/integral_constant.h>
 
-#  include <cstddef>
-
 CUB_NAMESPACE_BEGIN
 
 // Public extension surface.

From 8eaa4950e88557bd0fb3e943bd1339eb26402dbc Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 14:37:13 -0700
Subject: [PATCH 28/83] annotate tile-path return-valued helpers with
 [[nodiscard]]

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh    | 10 +++++-----
 .../device/dispatch/kernels/kernel_transform_tile.cuh  |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 0c2dfaf9ba2..5e4bedf725b 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -55,7 +55,7 @@ namespace detail::transform::tile
 {
 
 template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std::size_t... Idx>
-cudaError_t launch_impl(
+[[nodiscard]] cudaError_t launch_impl(
   ::cuda::std::tuple<Ins*...> inputs,
   Out* output,
   int64_t num_items,
@@ -78,7 +78,7 @@ cudaError_t launch_impl(
 struct DeviceTransform
 {
   template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
-  static cudaError_t
+  [[nodiscard]] static cudaError_t
   Transform(::cuda::std::tuple<Ins*...> inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0)
   {
     constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size<Out, Ins...>(MufuHeavy);
@@ -87,7 +87,7 @@ struct DeviceTransform
 
   // Fill
   template <int TileSize = 0, typename T>
-  static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0)
+  [[nodiscard]] static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0)
   {
     if (num_items <= 0)
     {
@@ -127,7 +127,7 @@ inline constexpr bool tile_dispatch_eligible_v =
 // and ct::assume_divisible<16>, so violating these at runtime is UB.
 // Returns false to tell the hook to fall back to the standard CUB dispatch.
 template <typename OutIter, typename... InIters, typename OffsetT>
-CUB_RUNTIME_FUNCTION bool
+[[nodiscard]] CUB_RUNTIME_FUNCTION bool
 runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
 {
   // Pointer alignment is in bytes (for LDG.E.128); the kernel's
@@ -160,7 +160,7 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
 // mirror of Op with __tile__ operator), NOT the user's Op instance -- the
 // user's scalar functor cannot be invoked on ct::tile arguments.
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
-CUB_RUNTIME_FUNCTION cudaError_t dispatch(
+[[nodiscard]] CUB_RUNTIME_FUNCTION cudaError_t dispatch(
   ::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, cudaStream_t stream)
 {
   auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index f31cfca5d40..1d96a29e3c0 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -34,7 +34,7 @@ namespace detail::transform::tile
 // The caller is responsible for honoring assume_aligned<16>; the dispatch header's
 // runtime_preconditions_valid enforces this before launching either kernel.
 template <int TileSize, typename T, typename N>
-__tile__ auto make_partition_view(T* ptr, N n)
+[[nodiscard]] __tile__ auto make_partition_view(T* ptr, N n)
 {
   namespace ct        = ::cuda::tiles;
   const auto ptr_align = ct::assume_aligned<16>(ptr);

From 0055590b01134fe408ace0922b307fa919ec36d4 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:06:54 -0700
Subject: [PATCH 29/83] drop redundant __detail sub-namespace from tile
 dispatch helper

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 5e4bedf725b..8b69b85ae22 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -100,13 +100,10 @@ struct DeviceTransform
   }
 };
 
-namespace __detail
-{
 template <typename Iter>
 using __unwrapped_value_t =
   ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t<decltype(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(
     ::cuda::std::declval<Iter>()))>>;
-} // namespace __detail
 
 // Combined compile-time predicate used by cub::DeviceTransform's __transform_internal
 // to decide whether to route a given (Op, OutIter, InIters...) to the tile path.
@@ -119,7 +116,7 @@ inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
   && CUB_NS_QUALIFIER::transform::tile_eligible_v<
-       Op, __detail::__unwrapped_value_t<OutIter>, sizeof...(InIters)>;
+       Op, __unwrapped_value_t<OutIter>, sizeof...(InIters)>;
 
 // Runtime predicate consulted by the cub::DeviceTransform tile hook before
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize

From c7ee05c41d63341454a3767b87a6e7847bb0c139 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:12:26 -0700
Subject: [PATCH 30/83] use cub::detail::it_value_t and drop hand-rolled unwrap
 helper

---
 .../device/dispatch/dispatch_transform_tile.cuh  | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 8b69b85ae22..49e61de1eba 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -39,9 +39,6 @@
 #  include <cuda/std/__tuple_dir/apply.h>
 #  include <cuda/std/__type_traits/is_empty.h>
 #  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
-#  include <cuda/std/__utility/declval.h>
-#  include <cuda/std/__type_traits/remove_cv.h>
-#  include <cuda/std/__type_traits/remove_pointer.h>
 #  include <cuda/std/tuple>
 #  include <cuda/std/utility>
 
@@ -100,11 +97,6 @@ struct DeviceTransform
   }
 };
 
-template <typename Iter>
-using __unwrapped_value_t =
-  ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t<decltype(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(
-    ::cuda::std::declval<Iter>()))>>;
-
 // Combined compile-time predicate used by cub::DeviceTransform's __transform_internal
 // to decide whether to route a given (Op, OutIter, InIters...) to the tile path.
 // The call site lifts this into an `if constexpr`: when this is true the hook
@@ -115,8 +107,7 @@ template <typename Op, typename OutIter, typename... InIters>
 inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
-  && CUB_NS_QUALIFIER::transform::tile_eligible_v<
-       Op, __unwrapped_value_t<OutIter>, sizeof...(InIters)>;
+  && CUB_NS_QUALIFIER::transform::tile_eligible_v<Op, it_value_t<OutIter>, sizeof...(InIters)>;
 
 // Runtime predicate consulted by the cub::DeviceTransform tile hook before
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
@@ -166,9 +157,8 @@ template <typename TransformOp, typename OutIter, typename... InIters, typename
       return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
     },
     inputs);
-  using out_value_t = ::cuda::std::remove_cv_t<::cuda::std::remove_pointer_t<decltype(out_ptr)>>;
-  using tile_op_t   =
-    typename CUB_NS_QUALIFIER::transform::tile_eligible<TransformOp, out_value_t, sizeof...(InIters)>::tile_op_type;
+  using tile_op_t =
+    typename CUB_NS_QUALIFIER::transform::tile_eligible<TransformOp, it_value_t<OutIter>, sizeof...(InIters)>::tile_op_type;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_op_type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,

From 91d3945d0ac13b84b6f861d95f07f89396971361 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:31:31 -0700
Subject: [PATCH 31/83] drop redundant template keyword on
 DeviceTransform::Transform call

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 49e61de1eba..428f34e8c9b 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -164,7 +164,7 @@ template <typename TransformOp, typename OutIter, typename... InIters, typename
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_op_type must be trivially default constructible");
 
-  return DeviceTransform::template Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
+  return DeviceTransform::Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
     in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
 }
 

From 3c67f7deb9b4d3f8d98d86bfa1efa7a1e9cdc9a9 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:35:28 -0700
Subject: [PATCH 32/83] wrap kernel-launch error checks with CubDebug

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 428f34e8c9b..7a0815d575c 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -31,6 +31,7 @@
 #  include <cub/device/dispatch/dispatch_transform_tile_traits.cuh>
 #  include <cub/device/dispatch/kernels/kernel_transform_tile.cuh>
 #  include <cub/device/dispatch/tuning/tuning_transform_tile.cuh>
+#  include <cub/util_debug.cuh>
 
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
@@ -69,7 +70,7 @@ template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std:
   transform_kernel<TileSize, Fn><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
     num_items, output, ::cuda::std::get<Idx>(inputs)...);
 
-  return cudaGetLastError();
+  return CubDebug(cudaGetLastError());
 }
 
 struct DeviceTransform
@@ -93,7 +94,7 @@ struct DeviceTransform
     constexpr int chosen     = (TileSize > 0) ? TileSize : pick_tile_size<T>();
     const int64_t num_blocks = (num_items + chosen - 1) / chosen;
     fill_kernel<chosen, T><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(num_items, output, value);
-    return cudaGetLastError();
+    return CubDebug(cudaGetLastError());
   }
 };
 

From fd9b7a285d8160db433ffa4f1c338019b5d853ad Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:39:03 -0700
Subject: [PATCH 33/83] fully qualify tile kernel-launch names and use unsigned
 in casts

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 7a0815d575c..cc0aa93cea6 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -67,8 +67,8 @@ template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std:
 
   const int64_t num_blocks = (num_items + TileSize - 1) / TileSize;
 
-  transform_kernel<TileSize, Fn><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(
-    num_items, output, ::cuda::std::get<Idx>(inputs)...);
+  CUB_NS_QUALIFIER::detail::transform::tile::transform_kernel<TileSize, Fn>
+    <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get<Idx>(inputs)...);
 
   return CubDebug(cudaGetLastError());
 }
@@ -93,7 +93,8 @@ struct DeviceTransform
     }
     constexpr int chosen     = (TileSize > 0) ? TileSize : pick_tile_size<T>();
     const int64_t num_blocks = (num_items + chosen - 1) / chosen;
-    fill_kernel<chosen, T><<<static_cast<unsigned int>(num_blocks), 1, 0, stream>>>(num_items, output, value);
+    CUB_NS_QUALIFIER::detail::transform::tile::fill_kernel<chosen, T>
+      <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, value);
     return CubDebug(cudaGetLastError());
   }
 };

From 0793ffe82afb833f912d40f49f8301b3cb74f984 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:47:36 -0700
Subject: [PATCH 34/83] document tile_mufu_heavy with a usage hint

---
 cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index 660ec4f10c6..bf8f9caa1a3 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -60,6 +60,8 @@ struct tile_eligible : ::cuda::std::false_type
 template <typename Op, typename T, ::cuda::std::size_t NIn>
 inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
 
+// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq). Setting this makes
+// the tile policy picker cap items/thread so MUFU pipes are not oversaturated.
 template <typename Op>
 struct tile_mufu_heavy : ::cuda::std::false_type
 {};

From 3e206e69b9179a999f069220b15c764c2607d705 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:51:37 -0700
Subject: [PATCH 35/83] use ::cuda::ceil_div for block-count math in tile
 dispatch

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index cc0aa93cea6..f4597cf67ff 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -36,6 +36,7 @@
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
+#  include <cuda/cmath>
 #  include <cuda/std/__memory/is_sufficiently_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
 #  include <cuda/std/__type_traits/is_empty.h>
@@ -65,7 +66,7 @@ template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std:
     return cudaSuccess;
   }
 
-  const int64_t num_blocks = (num_items + TileSize - 1) / TileSize;
+  const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{TileSize});
 
   CUB_NS_QUALIFIER::detail::transform::tile::transform_kernel<TileSize, Fn>
     <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get<Idx>(inputs)...);
@@ -92,7 +93,7 @@ struct DeviceTransform
       return cudaSuccess;
     }
     constexpr int chosen     = (TileSize > 0) ? TileSize : pick_tile_size<T>();
-    const int64_t num_blocks = (num_items + chosen - 1) / chosen;
+    const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{chosen});
     CUB_NS_QUALIFIER::detail::transform::tile::fill_kernel<chosen, T>
       <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, value);
     return CubDebug(cudaGetLastError());

From e56f0d642ff81ce1d11f2500b825b90b9b9bf196 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 17:57:24 -0700
Subject: [PATCH 36/83] reuse CUB's cc_to_min_bytes_in_flight, take
 compute_capability object

---
 .../dispatch/tuning/tuning_transform_tile.cuh | 28 +++++--------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index 4bd82475c5c..aabc9c1852a 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -1,8 +1,8 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-// Policy picker for cub::DeviceTransform's tile path. Mirrors the
-// bytes-in-flight target used by CUB's non-tile algorithms (see
+// Policy picker for cub::DeviceTransform's tile path. Shares the
+// bytes-in-flight target used by CUB's non-tile algorithms (calls
 // tuning_transform.cuh's cc_to_min_bytes_in_flight) but expresses the
 // answer as a TileSize, since tile kernels partition by compile-time
 // shape rather than threads*items.
@@ -23,6 +23,9 @@
 
 #if _CCCL_CUB_HAS_TILE_TRANSFORM()
 
+#  include <cub/device/dispatch/tuning/tuning_transform.cuh>
+
+#  include <cuda/__device/compute_capability.h>
 #  include <cuda/cmath>
 
 CUB_NAMESPACE_BEGIN
@@ -30,23 +33,6 @@ CUB_NAMESPACE_BEGIN
 namespace detail::transform::tile
 {
 
-constexpr int min_bytes_in_flight_per_sm(int cc_x10)
-{
-  if (cc_x10 >= 1000)
-  {
-    return 64 * 1024; // B200
-  }
-  if (cc_x10 >= 900)
-  {
-    return 48 * 1024; // H100/H200
-  }
-  if (cc_x10 >= 800)
-  {
-    return 16 * 1024; // A100
-  }
-  return 12 * 1024;
-}
-
 constexpr int min_size(int a)
 {
   return a;
@@ -63,7 +49,7 @@ constexpr int min_size(int a, int b, Ts... rest)
 // registers and the compiler unpacks them and packs them back. reducing the
 // compute work per thread helps here. need profiling to know the exact cause.
 template <typename Out, typename... Ins>
-constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000)
+constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability cc = {10, 0})
 {
   constexpr int threads_per_block    = 128;
   constexpr int vector_bytes         = 16; // LDG.E.128 -> 16 bytes
@@ -75,7 +61,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, int cc_x10 = 1000)
 
   // Fill (zero inputs) keeps the same latency target by counting output bytes.
   constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out));
-  const int target             = min_bytes_in_flight_per_sm(cc_x10);
+  const int target             = cc_to_min_bytes_in_flight(cc);
   const int items_for_latency =
     static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
 

From e9e9939a4e46a8c895092d67433287e3c13da379 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 18:01:25 -0700
Subject: [PATCH 37/83] use ::cuda::std::min initializer list instead of
 hand-rolled variadic min

---
 .../dispatch/tuning/tuning_transform_tile.cuh      | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index aabc9c1852a..f31066dd034 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -27,23 +27,13 @@
 
 #  include <cuda/__device/compute_capability.h>
 #  include <cuda/cmath>
+#  include <cuda/std/__algorithm/min.h>
 
 CUB_NAMESPACE_BEGIN
 
 namespace detail::transform::tile
 {
 
-constexpr int min_size(int a)
-{
-  return a;
-}
-template <class... Ts>
-constexpr int min_size(int a, int b, Ts... rest)
-{
-  int m = a < b ? a : b;
-  return min_size(m, rest...);
-}
-
 // mufu_heavy=true tells the policy the functor body has heavy MUFU usage.
 // for small data types, vectorized load will make them arrive packed in
 // registers and the compiler unpacks them and packs them back. reducing the
@@ -56,7 +46,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
   constexpr int max_items_per_thread = 32;
   constexpr int max_occupancy        = 16;
 
-  constexpr int min_elem      = min_size(int(sizeof(Out)), int(sizeof(Ins))...);
+  constexpr int min_elem      = ::cuda::std::min({int(sizeof(Out)), int(sizeof(Ins))...});
   constexpr int items_for_vec = static_cast<int>(::cuda::ceil_div(vector_bytes, min_elem));
 
   // Fill (zero inputs) keeps the same latency target by counting output bytes.

From cc77ef2157cafbf7dbf6e5bc296fe7fa760c6a7e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 9 Jun 2026 18:14:53 -0700
Subject: [PATCH 38/83] drop int() casts on sizeof and use ::cuda::std::max

---
 .../dispatch/tuning/tuning_transform_tile.cuh  | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index f31066dd034..51892ef7005 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -27,7 +27,9 @@
 
 #  include <cuda/__device/compute_capability.h>
 #  include <cuda/cmath>
+#  include <cuda/std/__algorithm/max.h>
 #  include <cuda/std/__algorithm/min.h>
+#  include <cuda/std/__cstddef/types.h>
 
 CUB_NAMESPACE_BEGIN
 
@@ -46,17 +48,17 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
   constexpr int max_items_per_thread = 32;
   constexpr int max_occupancy        = 16;
 
-  constexpr int min_elem      = ::cuda::std::min({int(sizeof(Out)), int(sizeof(Ins))...});
+  constexpr auto min_elem     = ::cuda::std::min({sizeof(Out), sizeof(Ins)...});
   constexpr int items_for_vec = static_cast<int>(::cuda::ceil_div(vector_bytes, min_elem));
 
   // Fill (zero inputs) keeps the same latency target by counting output bytes.
-  constexpr int bytes_per_iter = (sizeof...(Ins) > 0) ? (int(sizeof(Ins)) + ... + 0) : int(sizeof(Out));
-  const int target             = cc_to_min_bytes_in_flight(cc);
+  constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out);
+  const int target              = cc_to_min_bytes_in_flight(cc);
   const int items_for_latency =
     static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
 
-  int items = items_for_vec > items_for_latency ? items_for_vec : items_for_latency;
-  items     = static_cast<int>(::cuda::next_power_of_two(static_cast<unsigned int>(items)));
+  int items = ::cuda::std::max(items_for_vec, items_for_latency);
+  items     = static_cast<int>(::cuda::next_power_of_two(static_cast<unsigned>(items)));
   if (items > max_items_per_thread)
   {
     items = max_items_per_thread;
@@ -64,10 +66,10 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
 
   if (mufu_heavy && min_elem < 4)
   {
-    const int byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
-    if (items > byte_cap)
+    const auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
+    if (static_cast<decltype(byte_cap)>(items) > byte_cap)
     {
-      items = byte_cap;
+      items = static_cast<int>(byte_cap);
     }
   }
 

From 69d2339676b8422e256cbc666b2745e7fe393570 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 14:25:45 -0700
Subject: [PATCH 39/83] simplify _CCCL_CUB_HAS_TILE_TRANSFORM to just
 _CCCL_TILE_COMPILATION

---
 .../dispatch/dispatch_transform_tile_config.cuh     | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
index cd43a9d8b48..4636d3c5759 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -5,10 +5,12 @@
 // share. Two macros:
 //
 //   _CCCL_CUB_HAS_TILE_TRANSFORM()
-//     True when CUB's tile transform machinery is available: CTK 13.3 or newer,
-//     C++20 (tile DSL requires it), and the tile compilation trajectory
-//     (--enable-tile). When false, the tile headers (kernel / tuning / dispatch
-//     / traits) are skipped entirely.
+//     True when nvcc is compiling in tile mode (--enable-tile, i.e.
+//     _CCCL_TILE_COMPILATION()). The other preconditions tile needs are
+//     enforced where they belong: CTK 13.3+ is implied because --enable-tile
+//     is a 13.3+ nvcc flag, and C++20 is enforced by cuda_tile.h itself with
+//     an explicit #error. When false, the tile headers (kernel / tuning /
+//     dispatch / traits) are skipped entirely.
 //
 //   _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 //     True when the dispatch hook in cub::DeviceTransform should fire. Same as
@@ -27,8 +29,7 @@
 #  pragma system_header
 #endif // no system header
 
-#define _CCCL_CUB_HAS_TILE_TRANSFORM() \
-  (_CCCL_CTK_AT_LEAST(13, 3) && _CCCL_STD_VER >= 2020 && _CCCL_TILE_COMPILATION())
+#define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION()
 
 #define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() \
   (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH))

From 4c2d0c52f114b2da4dbdb9f6d450b74601c82b38 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 15:55:09 -0700
Subject: [PATCH 40/83] fully qualify cub::detail/cub::transform refs + ::cuda*
 runtime types

Per @fbusato review: spell out the full namespace at every site instead
of relying on the local cub::detail::transform::tile scope; existing
CUB_NS_QUALIFIER:: prefixes are replaced by a literal cub:: as well. The
CUDA runtime names get the same treatment: ::cudaError_t, ::cudaStream_t,
::cudaSuccess, and ::cudaGetLastError(). Default stream parameters are now
nullptr instead of the literal 0. <cstdint> is replaced by
<cuda/std/cstdint>, and bare int64_t becomes ::cuda::std::int64_t.
---
 cub/cub/device/device_transform.cuh           |  7 +--
 .../dispatch/dispatch_transform_tile.cuh      | 49 ++++++++++---------
 .../dispatch/tuning/tuning_transform_tile.cuh |  2 +-
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 9e1404c0f62..7d8bd316e81 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -113,14 +113,15 @@ struct DeviceTransform
     // not an error.
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
-                  && detail::transform::tile::tile_dispatch_eligible_v<
+                  && cub::detail::transform::tile::tile_dispatch_eligible_v<
                        TransformOp,
                        RandomAccessIteratorOut,
                        RandomAccessIteratorsIn...>)
     {
-      if (detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(num_items)))
+      if (cub::detail::transform::tile::runtime_preconditions_valid(
+            inputs, output, static_cast<offset_t>(num_items)))
       {
-        return detail::transform::tile::dispatch<TransformOp>(
+        return cub::detail::transform::tile::dispatch<TransformOp>(
           inputs, output, static_cast<offset_t>(num_items), stream);
       }
     }
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index f4597cf67ff..63b565f812f 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -46,7 +46,7 @@
 
 #  include <cuda_runtime.h>
 
-#  include <cstdint>
+#  include <cuda/std/cstdint>
 
 CUB_NAMESPACE_BEGIN
 
@@ -54,49 +54,54 @@ namespace detail::transform::tile
 {
 
 template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std::size_t... Idx>
-[[nodiscard]] cudaError_t launch_impl(
+[[nodiscard]] ::cudaError_t launch_impl(
   ::cuda::std::tuple<Ins*...> inputs,
   Out* output,
-  int64_t num_items,
-  cudaStream_t stream,
+  ::cuda::std::int64_t num_items,
+  ::cudaStream_t stream,
   ::cuda::std::index_sequence<Idx...>)
 {
   if (num_items <= 0)
   {
-    return cudaSuccess;
+    return ::cudaSuccess;
   }
 
-  const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{TileSize});
+  const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{TileSize});
 
-  CUB_NS_QUALIFIER::detail::transform::tile::transform_kernel<TileSize, Fn>
+  cub::detail::transform::tile::transform_kernel<TileSize, Fn>
     <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get<Idx>(inputs)...);
 
-  return CubDebug(cudaGetLastError());
+  return CubDebug(::cudaGetLastError());
 }
 
 struct DeviceTransform
 {
   template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
-  [[nodiscard]] static cudaError_t
-  Transform(::cuda::std::tuple<Ins*...> inputs, Out* output, int64_t num_items, Fn, cudaStream_t stream = 0)
+  [[nodiscard]] static ::cudaError_t Transform(
+    ::cuda::std::tuple<Ins*...> inputs,
+    Out* output,
+    ::cuda::std::int64_t num_items,
+    Fn,
+    ::cudaStream_t stream = nullptr)
   {
-    constexpr int chosen = (TileSize > 0) ? TileSize : pick_tile_size<Out, Ins...>(MufuHeavy);
+    constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<Out, Ins...>(MufuHeavy);
     return launch_impl<chosen, Fn>(inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
   }
 
   // Fill
   template <int TileSize = 0, typename T>
-  [[nodiscard]] static cudaError_t Fill(T* output, int64_t num_items, T value, cudaStream_t stream = 0)
+  [[nodiscard]] static ::cudaError_t
+  Fill(T* output, ::cuda::std::int64_t num_items, T value, ::cudaStream_t stream = nullptr)
   {
     if (num_items <= 0)
     {
-      return cudaSuccess;
+      return ::cudaSuccess;
     }
-    constexpr int chosen     = (TileSize > 0) ? TileSize : pick_tile_size<T>();
-    const int64_t num_blocks = ::cuda::ceil_div(num_items, int64_t{chosen});
-    CUB_NS_QUALIFIER::detail::transform::tile::fill_kernel<chosen, T>
+    constexpr int chosen                  = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<T>();
+    const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen});
+    cub::detail::transform::tile::fill_kernel<chosen, T>
       <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, value);
-    return CubDebug(cudaGetLastError());
+    return CubDebug(::cudaGetLastError());
   }
 };
 
@@ -110,7 +115,7 @@ template <typename Op, typename OutIter, typename... InIters>
 inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
-  && CUB_NS_QUALIFIER::transform::tile_eligible_v<Op, it_value_t<OutIter>, sizeof...(InIters)>;
+  && cub::transform::tile_eligible_v<Op, cub::detail::it_value_t<OutIter>, sizeof...(InIters)>;
 
 // Runtime predicate consulted by the cub::DeviceTransform tile hook before
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
@@ -151,8 +156,8 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
 // mirror of Op with __tile__ operator), NOT the user's Op instance -- the
 // user's scalar functor cannot be invoked on ct::tile arguments.
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
-[[nodiscard]] CUB_RUNTIME_FUNCTION cudaError_t dispatch(
-  ::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, cudaStream_t stream)
+[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t dispatch(
+  ::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
 {
   auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
   auto in_ptrs = ::cuda::std::apply(
@@ -161,13 +166,13 @@ template <typename TransformOp, typename OutIter, typename... InIters, typename
     },
     inputs);
   using tile_op_t =
-    typename CUB_NS_QUALIFIER::transform::tile_eligible<TransformOp, it_value_t<OutIter>, sizeof...(InIters)>::tile_op_type;
+    typename cub::transform::tile_eligible<TransformOp, cub::detail::it_value_t<OutIter>, sizeof...(InIters)>::tile_op_type;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_op_type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_op_type must be trivially default constructible");
 
-  return DeviceTransform::Transform<0, CUB_NS_QUALIFIER::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
+  return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
     in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
 }
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index 51892ef7005..bea4e390eab 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -53,7 +53,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
 
   // Fill (zero inputs) keeps the same latency target by counting output bytes.
   constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out);
-  const int target              = cc_to_min_bytes_in_flight(cc);
+  const int target              = cub::detail::transform::cc_to_min_bytes_in_flight(cc);
   const int items_for_latency =
     static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
 

From 9d861ecee1fbf06dc43ef0f81cf8a7932f49eeeb Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 16:08:49 -0700
Subject: [PATCH 41/83] reflow kernel_transform_tile.cuh comments to 120-column
 limit

Per @fbusato nit: the comment blocks were wrapped at ~96 cols, leaving
~20 cols of the 120 budget unused. Reflow to pack each line near 120.
No code or semantic changes.
---
 .../dispatch/kernels/kernel_transform_tile.cuh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 1d96a29e3c0..154eeceb9c3 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -27,12 +27,12 @@ namespace detail::transform::tile
 {
 
 // Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing:
-//   assume_aligned<16>      -- promises the pointer is 16-byte aligned, so the compiler can pick
-//                              LDG.E.128 vectorized loads/stores.
-//   ct::extents<int64_t,..> -- explicit element type on the extent; CTAD would deduce uint32_t and
-//                              wrap at 2^32. int64_t lets us cover the full num_items range.
-// The caller is responsible for honoring assume_aligned<16>; the dispatch header's
-// runtime_preconditions_valid enforces this before launching either kernel.
+//   assume_aligned<16>      -- promises the pointer is 16-byte aligned, so the compiler can pick LDG.E.128 vectorized
+//                              loads/stores.
+//   ct::extents<int64_t,..> -- explicit element type on the extent; CTAD would deduce uint32_t and wrap at 2^32.
+//                              int64_t lets us cover the full num_items range.
+// The caller is responsible for honoring assume_aligned<16>; the dispatch header's runtime_preconditions_valid
+// enforces this before launching either kernel.
 template <int TileSize, typename T, typename N>
 [[nodiscard]] __tile__ auto make_partition_view(T* ptr, N n)
 {
@@ -42,9 +42,9 @@ template <int TileSize, typename T, typename N>
   return ct::partition_view{span, ct::shape<TileSize>{}};
 }
 
-// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on
-// every pointer and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in
-// the dispatch header are responsible for honoring those preconditions.
+// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on every pointer
+// and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in the dispatch header are
+// responsible for honoring those preconditions.
 //
 // assume_divisible<16>      -- promises num_items % 16 == 0, so the tile DSL can elide tail handling.
 // assume_bounded_below<0>   -- promises num_items >= 0; enables sign-comparison simplifications.

From b83cc7031707cd836a894c91d045f69b70f1f86c Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 16:53:49 -0700
Subject: [PATCH 42/83] anchor make_partition_view with using-decl; inline
 stateless Fn

Per @fbusato suggestion on transform_kernel:
- drop the named `Fn fn{}` local and construct the stateless functor
  inline as `Fn{}(...)` at the call site
- add `using cub::detail::transform::tile::make_partition_view;` to
  anchor the helper name explicitly (called twice), consistent with the
  full-qualification convention, instead of relying on enclosing-namespace
  lookup
---
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 154eeceb9c3..d2131a5b646 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -53,14 +53,14 @@ __tile_global__ void
 transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
   namespace ct  = ::cuda::tiles;
+  using cub::detail::transform::tile::make_partition_view;
   const auto bx = ct::bid().x;
-  Fn fn{};
 
   const auto n     = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
   auto out_view    = make_partition_view<TileSize>(out, n);
   auto load_one    = [bx, n](auto* ptr) { return make_partition_view<TileSize>(ptr, n).load_masked(bx); };
 
-  out_view.store_masked(fn(load_one(ins)...), bx);
+  out_view.store_masked(Fn{}(load_one(ins)...), bx);
 }
 
 template <int TileSize, typename T>

From 60db7cea4721d7bf5bc8114d5a59c091945e76ba Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:00:21 -0700
Subject: [PATCH 43/83] anchor remaining intra-namespace helper calls

Same convention as the make_partition_view anchor in transform_kernel,
applied to the two spots the qualification sweep left bare:
- fill_kernel: add the make_partition_view using-decl (matches transform_kernel)
- DeviceTransform::Transform: full-path qualify launch_impl, matching the
  pick_tile_size call directly above it

Type-name references (DeviceTransform within its own namespace) left as-is.
---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh       | 3 ++-
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 63b565f812f..c038faf8435 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -85,7 +85,8 @@ struct DeviceTransform
     ::cudaStream_t stream = nullptr)
   {
     constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<Out, Ins...>(MufuHeavy);
-    return launch_impl<chosen, Fn>(inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
+    return cub::detail::transform::tile::launch_impl<chosen, Fn>(
+      inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
   }
 
   // Fill
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index d2131a5b646..157be4b0c2b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -67,6 +67,7 @@ template <int TileSize, typename T>
 __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
 {
   namespace ct  = ::cuda::tiles;
+  using cub::detail::transform::tile::make_partition_view;
   const auto bx = ct::bid().x;
 
   const auto n  = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));

From 85394e1b1a6c231842b1d88cec6c5850339815d2 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:06:07 -0700
Subject: [PATCH 44/83] rename make_partition_view ->
 make_aligned_partition_view

Per @fbusato: the helper bakes in ct::assume_aligned<16>, so the name
should advertise that it returns an aligned partition view rather than a
plain ct::partition_view. Pure rename, 6 sites, all in this file.
---
 .../dispatch/kernels/kernel_transform_tile.cuh       | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 157be4b0c2b..63f5d61d5cb 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -34,7 +34,7 @@ namespace detail::transform::tile
 // The caller is responsible for honoring assume_aligned<16>; the dispatch header's runtime_preconditions_valid
 // enforces this before launching either kernel.
 template <int TileSize, typename T, typename N>
-[[nodiscard]] __tile__ auto make_partition_view(T* ptr, N n)
+[[nodiscard]] __tile__ auto make_aligned_partition_view(T* ptr, N n)
 {
   namespace ct        = ::cuda::tiles;
   const auto ptr_align = ct::assume_aligned<16>(ptr);
@@ -53,12 +53,12 @@ __tile_global__ void
 transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
   namespace ct  = ::cuda::tiles;
-  using cub::detail::transform::tile::make_partition_view;
+  using cub::detail::transform::tile::make_aligned_partition_view;
   const auto bx = ct::bid().x;
 
   const auto n     = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  auto out_view    = make_partition_view<TileSize>(out, n);
-  auto load_one    = [bx, n](auto* ptr) { return make_partition_view<TileSize>(ptr, n).load_masked(bx); };
+  auto out_view    = make_aligned_partition_view<TileSize>(out, n);
+  auto load_one    = [bx, n](auto* ptr) { return make_aligned_partition_view<TileSize>(ptr, n).load_masked(bx); };
 
   out_view.store_masked(Fn{}(load_one(ins)...), bx);
 }
@@ -67,11 +67,11 @@ template <int TileSize, typename T>
 __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
 {
   namespace ct  = ::cuda::tiles;
-  using cub::detail::transform::tile::make_partition_view;
+  using cub::detail::transform::tile::make_aligned_partition_view;
   const auto bx = ct::bid().x;
 
   const auto n  = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  auto out_view = make_partition_view<TileSize>(out, n);
+  auto out_view = make_aligned_partition_view<TileSize>(out, n);
   using tile_t  = ct::tile<T, ct::shape<TileSize>>;
   out_view.store_masked(ct::full<tile_t>(value), bx);
 }

From 63f6bda135f0e5b8a9574aee42485ee130fea47b Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:20:23 -0700
Subject: [PATCH 45/83] mark out_view const in both tile kernels

Per @fbusato: out_view is never re-seated, only stored through. ct::partition_view
is a non-owning view with shallow const (store_masked/load_masked are const member
functions, like std::span/mdspan), so a const view still writes through -- verified
the store still compiles and the dispatch test stays bit-exact. Kept the
make_aligned_partition_view using-decl rather than fully qualifying inline; the name
is already anchored. Applied to fill_kernel too for parity.
---
 .../dispatch/kernels/kernel_transform_tile.cuh       | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 63f5d61d5cb..585cefc833d 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -56,9 +56,9 @@ transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, co
   using cub::detail::transform::tile::make_aligned_partition_view;
   const auto bx = ct::bid().x;
 
-  const auto n     = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  auto out_view    = make_aligned_partition_view<TileSize>(out, n);
-  auto load_one    = [bx, n](auto* ptr) { return make_aligned_partition_view<TileSize>(ptr, n).load_masked(bx); };
+  const auto n        = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+  const auto out_view = make_aligned_partition_view<TileSize>(out, n);
+  auto load_one       = [bx, n](auto* ptr) { return make_aligned_partition_view<TileSize>(ptr, n).load_masked(bx); };
 
   out_view.store_masked(Fn{}(load_one(ins)...), bx);
 }
@@ -70,9 +70,9 @@ __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __rest
   using cub::detail::transform::tile::make_aligned_partition_view;
   const auto bx = ct::bid().x;
 
-  const auto n  = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  auto out_view = make_aligned_partition_view<TileSize>(out, n);
-  using tile_t  = ct::tile<T, ct::shape<TileSize>>;
+  const auto n        = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+  const auto out_view = make_aligned_partition_view<TileSize>(out, n);
+  using tile_t        = ct::tile<T, ct::shape<TileSize>>;
   out_view.store_masked(ct::full<tile_t>(value), bx);
 }
 

From 0a81752c661efe8e9de7b8c06b32957a4c5554f7 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:28:15 -0700
Subject: [PATCH 46/83] include specific libcu++ headers instead of
 <cuda/cmath> umbrella

Per @fbusato: use the narrowest internal header for each symbol, matching
the sibling non-tile dispatch_transform.cuh convention.
- dispatch_transform_tile.cuh: <cuda/cmath> -> <cuda/__cmath/ceil_div.h>;
  <cuda/std/utility> -> <cuda/std/__utility/integer_sequence.h>
- tuning_transform_tile.cuh: <cuda/cmath> -> <cuda/__cmath/ceil_div.h>
  + <cuda/__cmath/pow2.h>
Kept <cuda/std/tuple> (sibling keeps the umbrella) and <cuda/std/cstdint>.
---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh      | 4 ++--
 cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index c038faf8435..2e05252cce8 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -36,13 +36,13 @@
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
-#  include <cuda/cmath>
+#  include <cuda/__cmath/ceil_div.h>
 #  include <cuda/std/__memory/is_sufficiently_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
 #  include <cuda/std/__type_traits/is_empty.h>
 #  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
+#  include <cuda/std/__utility/integer_sequence.h>
 #  include <cuda/std/tuple>
-#  include <cuda/std/utility>
 
 #  include <cuda_runtime.h>
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index bea4e390eab..df6b849951d 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -25,8 +25,9 @@
 
 #  include <cub/device/dispatch/tuning/tuning_transform.cuh>
 
+#  include <cuda/__cmath/ceil_div.h>
+#  include <cuda/__cmath/pow2.h>
 #  include <cuda/__device/compute_capability.h>
-#  include <cuda/cmath>
 #  include <cuda/std/__algorithm/max.h>
 #  include <cuda/std/__algorithm/min.h>
 #  include <cuda/std/__cstddef/types.h>

From 33a783a31049402efe7a8fccb669fb1ce9d174b0 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:31:26 -0700
Subject: [PATCH 47/83] make byte_cap constexpr in pick_tile_size

Per @fbusato: byte_cap = vector_bytes / min_elem is a constant expression
(both operands are constexpr), so constexpr expresses its nature better
than const.
---
 cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index df6b849951d..b34da587fac 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -67,7 +67,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
 
   if (mufu_heavy && min_elem < 4)
   {
-    const auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
+    constexpr auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
     if (static_cast<decltype(byte_cap)>(items) > byte_cap)
     {
       items = static_cast<int>(byte_cap);

From 37cf303e0566e4ba7036ddf01156c67c06586766 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:42:04 -0700
Subject: [PATCH 48/83] drop redundant static_cast<int> on items_for_vec

Per @fbusato: ceil_div's result initializes a constexpr int directly; the
size_t->int copy-init is well-formed (not list-init) and CCCL's warning set
(-Wall -Wextra, no -Wconversion) doesn't flag it, so the cast was noise.
---
 cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index b34da587fac..fec23bcdb43 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -50,7 +50,7 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
   constexpr int max_occupancy        = 16;
 
   constexpr auto min_elem     = ::cuda::std::min({sizeof(Out), sizeof(Ins)...});
-  constexpr int items_for_vec = static_cast<int>(::cuda::ceil_div(vector_bytes, min_elem));
+  constexpr int items_for_vec = ::cuda::ceil_div(vector_bytes, min_elem);
 
   // Fill (zero inputs) keeps the same latency target by counting output bytes.
   constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out);

From 0fb0a4ff4d96455c324d363de7f3a05751c12143 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 17:44:14 -0700
Subject: [PATCH 49/83] define gate macro as literal 1/0 to avoid
 expansion-to-defined UB

_CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() previously expanded to
(_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)).
Using it in `#if` generated `defined` via macro expansion -- UB per the
standard, flagged by -Wexpansion-to-defined (in -Wall/-Wextra) and fatal
under CCCL's -Werror. Library headers hid it via #pragma GCC system_header,
but non-system consumers (benches, and the test once it moves to cub/tests)
would fail to compile.

Switch to the literal 1/0 form, the same idiom _CCCL_TILE_COMPILATION uses:
the defined() now lives directly in an #if, and consumers see #if 1 / #if 0.
---
 .../device/dispatch/dispatch_transform_tile_config.cuh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
index 4636d3c5759..8c25ea9bd30 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -31,5 +31,11 @@
 
 #define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION()
 
-#define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() \
-  (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH))
+// Defined as a literal 1/0 (not (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(...))) so that
+// `#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()` in non-system code (benches, tests) does not
+// generate `defined` via macro expansion, which is UB and trips -Wexpansion-to-defined under -Werror.
+#if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)
+#  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1
+#else
+#  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 0
+#endif

From 768ab0c3e802dd3cf41d2250899d144946dbb7a0 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 18:00:53 -0700
Subject: [PATCH 50/83] make the vector-width cap an int so the cap comparison
 needs no casts

@fbusato noted the cap looked like it should be int. It actually deduced to
size_t (min_elem is size_t from sizeof/cuda::std::min, so vector_bytes/min_elem
promotes to size_t), which is why the use sites needed two casts: one to lift
items to the cap's type for the comparison, and one to narrow the cap back to
int for the assignment. Cast once at the definition so it is genuinely int; the
comparison and assignment are then int-vs-int -- no sign-compare, no use-site
casts.

Also renamed byte_cap -> vec_items_cap: the value is an items/thread count (how
many elements fit in one 16-byte vector load), not a byte count, so the old name
wrongly implied a byte quantity / size_t.
---
 cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index fec23bcdb43..a7715e6f195 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -67,10 +67,13 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
 
   if (mufu_heavy && min_elem < 4)
   {
-    constexpr auto byte_cap = vector_bytes / min_elem; // 16 for I8, 8 for I16/half/bf16
-    if (static_cast<decltype(byte_cap)>(items) > byte_cap)
+    // Elements that fit in one 16-byte vector load -> items/thread cap for MUFU-heavy sub-4B ops.
+    // min_elem is size_t, so cast the quotient once here to keep this an int item count (matches
+    // items below, so the comparison/assignment stay int-vs-int: no sign-compare, no use-site casts).
+    constexpr int vec_items_cap = static_cast<int>(vector_bytes / min_elem); // 16 for I8, 8 for I16/half/bf16
+    if (items > vec_items_cap)
     {
-      items = static_cast<int>(byte_cap);
+      items = vec_items_cap;
     }
   }
 

From 418e59280aacb0fec1435341a5682669f50bd462 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 19:07:20 -0700
Subject: [PATCH 51/83] drop CUB_NS_QUALIFIER from tile_eligible substitutes
 (sweep missed them)

The fa9b87caec qualification sweep converted CUB_NS_QUALIFIER -> cub:: in
dispatch_transform_tile.cuh but missed the four tile_op_type aliases in the
traits file. They are tile_eligible specializations inside cub::transform, so
literal cub::detail::transform::tile::tile_plus/tile_multiplies is correct (and
resolves even under CUB_WRAPPED_NAMESPACE via enclosing-namespace lookup).
CUB_NS_QUALIFIER is only needed to name cub from outside the namespace.
---
 .../device/dispatch/dispatch_transform_tile_traits.cuh    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index bf8f9caa1a3..4759d97569e 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -106,12 +106,12 @@ namespace transform
 template <>
 struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus;
+  using tile_op_type = cub::detail::transform::tile::tile_plus;
 };
 template <>
 struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies;
+  using tile_op_type = cub::detail::transform::tile::tile_multiplies;
 };
 #  endif // _CCCL_HAS_NVFP16()
 
@@ -119,12 +119,12 @@ struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::s
 template <>
 struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_plus;
+  using tile_op_type = cub::detail::transform::tile::tile_plus;
 };
 template <>
 struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
 {
-  using tile_op_type = CUB_NS_QUALIFIER::detail::transform::tile::tile_multiplies;
+  using tile_op_type = cub::detail::transform::tile::tile_multiplies;
 };
 #  endif // _CCCL_HAS_NVBF16()
 } // namespace transform

From 5123278fa96b41ca170d3dd6da512f3263ecf69e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 19:19:33 -0700
Subject: [PATCH 52/83] document why num_blocks fits the unsigned grid x-dim

Per @fbusato suggestion: note that the static_cast<unsigned>(num_blocks) can't
truncate -- num_blocks > 2^32-1 would require num_items > TileSize * (2^32-1),
which is already on the order of 2^40 elements once TileSize >= 256, more than
any device can hold. Added to launch_impl and the parallel spot in Fill.
---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 2e05252cce8..b1e6041d71a 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -66,6 +66,8 @@ template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std:
     return ::cudaSuccess;
   }
 
+  // One CTA per tile. The cast to the unsigned grid x-dim can't truncate: num_blocks > 2^32-1
+  // would need num_items > TileSize * 2^32 (>= 2^40 elements), more than any device can hold.
   const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{TileSize});
 
   cub::detail::transform::tile::transform_kernel<TileSize, Fn>
@@ -99,6 +101,8 @@ struct DeviceTransform
       return ::cudaSuccess;
     }
     constexpr int chosen                  = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<T>();
+    // One CTA per tile; see launch_impl -- num_blocks can't exceed the unsigned grid x-dim for
+    // any device-sized num_items.
     const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen});
     cub::detail::transform::tile::fill_kernel<chosen, T>
       <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, value);

From 1fa728905929ae52dc36d59d161f144214088c5d Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 19:23:45 -0700
Subject: [PATCH 53/83] clang-format dispatch_transform_tile.cuh

---
 .../dispatch/dispatch_transform_tile.cuh      | 32 ++++++++-----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index b1e6041d71a..c170dd71013 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -42,17 +42,15 @@
 #  include <cuda/std/__type_traits/is_empty.h>
 #  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
 #  include <cuda/std/__utility/integer_sequence.h>
+#  include <cuda/std/cstdint>
 #  include <cuda/std/tuple>
 
 #  include <cuda_runtime.h>
 
-#  include <cuda/std/cstdint>
-
 CUB_NAMESPACE_BEGIN
 
 namespace detail::transform::tile
 {
-
 template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std::size_t... Idx>
 [[nodiscard]] ::cudaError_t launch_impl(
   ::cuda::std::tuple<Ins*...> inputs,
@@ -80,13 +78,10 @@ struct DeviceTransform
 {
   template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
   [[nodiscard]] static ::cudaError_t Transform(
-    ::cuda::std::tuple<Ins*...> inputs,
-    Out* output,
-    ::cuda::std::int64_t num_items,
-    Fn,
-    ::cudaStream_t stream = nullptr)
+    ::cuda::std::tuple<Ins*...> inputs, Out* output, ::cuda::std::int64_t num_items, Fn, ::cudaStream_t stream = nullptr)
   {
-    constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<Out, Ins...>(MufuHeavy);
+    constexpr int chosen =
+      (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<Out, Ins...>(MufuHeavy);
     return cub::detail::transform::tile::launch_impl<chosen, Fn>(
       inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
   }
@@ -100,7 +95,7 @@ struct DeviceTransform
     {
       return ::cudaSuccess;
     }
-    constexpr int chosen                  = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<T>();
+    constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<T>();
     // One CTA per tile; see launch_impl -- num_blocks can't exceed the unsigned grid x-dim for
     // any device-sized num_items.
     const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen});
@@ -137,13 +132,13 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
   constexpr int byte_align    = 16;
   constexpr int items_divisor = 16;
 
-  auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  auto out_ptr           = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
   const bool aligned_out = ::cuda::std::is_sufficiently_aligned<byte_align>(out_ptr);
   const bool aligned_in  = ::cuda::std::apply(
     [](auto... iters) {
-      return ((::cuda::std::is_sufficiently_aligned<byte_align>(
-                THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)))
-              && ...);
+      return (
+        (::cuda::std::is_sufficiently_aligned<byte_align>(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)))
+        && ...);
     },
     inputs);
 
@@ -161,8 +156,8 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
 // mirror of Op with __tile__ operator), NOT the user's Op instance -- the
 // user's scalar functor cannot be invoked on ct::tile arguments.
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
-[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t dispatch(
-  ::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
+[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t
+dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
 {
   auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
   auto in_ptrs = ::cuda::std::apply(
@@ -170,8 +165,8 @@ template <typename TransformOp, typename OutIter, typename... InIters, typename
       return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
     },
     inputs);
-  using tile_op_t =
-    typename cub::transform::tile_eligible<TransformOp, cub::detail::it_value_t<OutIter>, sizeof...(InIters)>::tile_op_type;
+  using tile_op_t = typename cub::transform::
+    tile_eligible<TransformOp, cub::detail::it_value_t<OutIter>, sizeof...(InIters)>::tile_op_type;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_op_type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
@@ -180,7 +175,6 @@ template <typename TransformOp, typename OutIter, typename... InIters, typename
   return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
     in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
 }
-
 } // namespace detail::transform::tile
 
 CUB_NAMESPACE_END

From 34568d4e57326e1de1c65abb9d01ab25a2762448 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Wed, 10 Jun 2026 19:37:44 -0700
Subject: [PATCH 54/83] split tile_op_t alias into an intermediate out_value_t

Per @fbusato: break the dense one-liner into an intermediate alias for the
output value type, so the tile_eligible instantiation reads cleanly on one line.
---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index c170dd71013..67756cb20c9 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -165,8 +165,8 @@ dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_item
       return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
     },
     inputs);
-  using tile_op_t = typename cub::transform::
-    tile_eligible<TransformOp, cub::detail::it_value_t<OutIter>, sizeof...(InIters)>::tile_op_type;
+  using out_value_t = cub::detail::it_value_t<OutIter>;
+  using tile_op_t = typename cub::transform::tile_eligible<TransformOp, out_value_t, sizeof...(InIters)>::tile_op_type;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_op_type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,

From f80bb25d1741a89b17a7ed3e5cc052044f6d6bd5 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 16:14:58 -0700
Subject: [PATCH 55/83] separate tile eligibility from the tile-operator
 substitute

Per @fbusato: tile_eligible<Op,T,NIn> previously did two jobs -- mark a combo
eligible AND carry a `tile_op_type` substitute. Split the two axes:
- tile_eligible<Op,T,NIn> -> eligibility only (bool).
- tile_operator<Op>       -> the __tile__ functor the tile kernel runs for Op,
                             with NO default: a scalar functor cannot run on
                             ct::tile, so every eligible op must specialize it.
                             Omitting it is a clear static_assert, not a cryptic
                             "calling __host__ __device__ from __tile_global__"
                             kernel error. tile_operator_t<Op> is the alias.

dispatch now uses tile_operator_t<Op>. Built-in cuda::std::plus/multiplies
substitutes and all bench/test registrations migrated to the two-trait form.
---
 .../bench/transform/tile/babelstream.cu       | 12 ++-
 cub/benchmarks/bench/transform/tile/copy.cu   |  3 +-
 .../bench/transform/tile/grayscale.cu         |  3 +-
 .../bench/transform/tile/pytorch.cu           | 42 ++++++----
 .../transform/tile/test_device_transform.cu   | 10 ++-
 .../dispatch/dispatch_transform_tile.cuh      | 14 ++--
 .../dispatch_transform_tile_traits.cuh        | 76 +++++++++++++------
 7 files changed, 105 insertions(+), 55 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 297ef78379a..9ee750ce35b 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -62,10 +62,14 @@ struct tile_nstream_op {
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <class T> struct tile_eligible<mul_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_mul_op; };
-template <class T> struct tile_eligible<add_op,     T, 2> : ::cuda::std::true_type { using tile_op_type = tile_add_op; };
-template <class T> struct tile_eligible<triad_op,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_triad_op; };
-template <class T> struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type { using tile_op_type = tile_nstream_op; };
+template <class T> struct tile_eligible<mul_op,     T, 1> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<add_op,     T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<triad_op,   T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type {};
+template <> struct tile_operator<mul_op>     { using type = tile_mul_op; };
+template <> struct tile_operator<add_op>     { using type = tile_add_op; };
+template <> struct tile_operator<triad_op>   { using type = tile_triad_op; };
+template <> struct tile_operator<nstream_op> { using type = tile_nstream_op; };
 } // namespace transform
 CUB_NAMESPACE_END
 #endif
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index da9665b2f25..6133c69c684 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -33,7 +33,8 @@ struct tile_identity {
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <class T> struct tile_eligible<identity, T, 1> : ::cuda::std::true_type { using tile_op_type = tile_identity; };
+template <class T> struct tile_eligible<identity, T, 1> : ::cuda::std::true_type {};
+template <> struct tile_operator<identity> { using type = tile_identity; };
 } // namespace transform
 CUB_NAMESPACE_END
 #endif
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index 9f364304266..5ad936019fa 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -42,7 +42,8 @@ struct tile_rgb_to_y {
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <class T> struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type { using tile_op_type = tile_rgb_to_y; };
+template <class T> struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type {};
+template <> struct tile_operator<rgb_to_y> { using type = tile_rgb_to_y; };
 } // namespace transform
 CUB_NAMESPACE_END
 #endif
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 0e1767fdac7..25e90b7f66e 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -92,12 +92,18 @@ CUB_NAMESPACE_BEGIN
 namespace transform
 {
 // Unary
-template <class T> struct tile_eligible<relu_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_relu;    };
-template <class T> struct tile_eligible<sigmoid_op, T, 1> : ::cuda::std::true_type { using tile_op_type = tile_sigmoid; };
-template <class T> struct tile_eligible<tanh_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_tanh;    };
-template <class T> struct tile_eligible<gelu_op,    T, 1> : ::cuda::std::true_type { using tile_op_type = tile_gelu;    };
-template <class T> struct tile_eligible<sin_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_sin;     };
-template <class T> struct tile_eligible<exp_op,     T, 1> : ::cuda::std::true_type { using tile_op_type = tile_exp;     };
+template <class T> struct tile_eligible<relu_op,    T, 1> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<sigmoid_op, T, 1> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<tanh_op,    T, 1> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<gelu_op,    T, 1> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<sin_op,     T, 1> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<exp_op,     T, 1> : ::cuda::std::true_type {};
+template <> struct tile_operator<relu_op>    { using type = tile_relu;    };
+template <> struct tile_operator<sigmoid_op> { using type = tile_sigmoid; };
+template <> struct tile_operator<tanh_op>    { using type = tile_tanh;    };
+template <> struct tile_operator<gelu_op>    { using type = tile_gelu;    };
+template <> struct tile_operator<sin_op>     { using type = tile_sin;     };
+template <> struct tile_operator<exp_op>     { using type = tile_exp;     };
 
 // MUFU-heavy unary ops: hint to tile policy picker to cap items/thread at vector width on sub-4-byte types.
 template <> struct tile_mufu_heavy<sigmoid_op> : ::cuda::std::true_type {};
@@ -107,14 +113,22 @@ template <> struct tile_mufu_heavy<sin_op>     : ::cuda::std::true_type {};
 template <> struct tile_mufu_heavy<exp_op>     : ::cuda::std::true_type {};
 
 // Binary
-template <class T> struct tile_eligible<binary_add,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_add;  };
-template <class T> struct tile_eligible<binary_sub,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_sub;  };
-template <class T> struct tile_eligible<binary_mul,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_mul;  };
-template <class T> struct tile_eligible<binary_div,  T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_div;  };
-template <class T> struct tile_eligible<binary_le,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_le;   };
-template <class T> struct tile_eligible<binary_ge,   T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_ge;   };
-template <class T> struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_fmin; };
-template <class T> struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type { using tile_op_type = tile_binary_fmax; };
+template <class T> struct tile_eligible<binary_add,  T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_sub,  T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_mul,  T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_div,  T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_le,   T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_ge,   T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type {};
+template <class T> struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type {};
+template <> struct tile_operator<binary_add>  { using type = tile_binary_add;  };
+template <> struct tile_operator<binary_sub>  { using type = tile_binary_sub;  };
+template <> struct tile_operator<binary_mul>  { using type = tile_binary_mul;  };
+template <> struct tile_operator<binary_div>  { using type = tile_binary_div;  };
+template <> struct tile_operator<binary_le>   { using type = tile_binary_le;   };
+template <> struct tile_operator<binary_ge>   { using type = tile_binary_ge;   };
+template <> struct tile_operator<binary_fmin> { using type = tile_binary_fmin; };
+template <> struct tile_operator<binary_fmax> { using type = tile_binary_fmax; };
 } // namespace transform
 CUB_NAMESPACE_END
 #endif
diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
index d3a143a3deb..e2c7a5006bb 100644
--- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu
+++ b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
@@ -182,10 +182,12 @@ void test_fill(int64_t n, T value) {
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <> struct tile_eligible<identity_op, int32_t, 1> : ::cuda::std::true_type { using tile_op_type = tile_identity_op; };
-template <> struct tile_eligible<identity_op, float, 1>   : ::cuda::std::true_type { using tile_op_type = tile_identity_op; };
-template <> struct tile_eligible<square_op, int32_t, 1>   : ::cuda::std::true_type { using tile_op_type = tile_square_op; };
-template <> struct tile_eligible<square_op, float, 1>     : ::cuda::std::true_type { using tile_op_type = tile_square_op; };
+template <> struct tile_eligible<identity_op, int32_t, 1> : ::cuda::std::true_type {};
+template <> struct tile_eligible<identity_op, float, 1>   : ::cuda::std::true_type {};
+template <> struct tile_eligible<square_op, int32_t, 1>   : ::cuda::std::true_type {};
+template <> struct tile_eligible<square_op, float, 1>     : ::cuda::std::true_type {};
+template <> struct tile_operator<identity_op> { using type = tile_identity_op; };
+template <> struct tile_operator<square_op>   { using type = tile_square_op; };
 } // namespace transform
 CUB_NAMESPACE_END
 #endif
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 67756cb20c9..7ce52562fa0 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -152,9 +152,9 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
 // caller (the hook in device_transform.cuh) is responsible for checking
 // runtime_preconditions_valid first.
 //
-// The tile kernel is launched with the trait's tile_op_type (a tile-friendly
-// mirror of Op with __tile__ operator), NOT the user's Op instance -- the
-// user's scalar functor cannot be invoked on ct::tile arguments.
+// The tile kernel is launched with tile_operator_t<Op>: for a scalar Op that is its
+// registered tile-friendly mirror (a __tile__ functor), and for an already-tile Op it
+// is Op itself. A scalar functor cannot be invoked on ct::tile arguments.
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
 [[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t
 dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
@@ -165,12 +165,12 @@ dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_item
       return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
     },
     inputs);
-  using out_value_t = cub::detail::it_value_t<OutIter>;
-  using tile_op_t = typename cub::transform::tile_eligible<TransformOp, out_value_t, sizeof...(InIters)>::tile_op_type;
+  // The tile functor to run for TransformOp: its registered tile_operator mirror.
+  using tile_op_t = cub::transform::tile_operator_t<TransformOp>;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
-                "tile_op_type must be stateless (the tile kernel default-constructs it)");
+                "tile_operator type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
-                "tile_op_type must be trivially default constructible");
+                "tile_operator type must be trivially default constructible");
 
   return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
     in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index 4759d97569e..f51d280264b 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -3,23 +3,27 @@
 
 // Compile-time policy for cub::DeviceTransform's tile path.
 //
-// PUBLIC EXTENSION POINTS (cub::transform):
-//   tile_eligible<Op, T, NIn>   -- specialize this to opt a (functor type,
-//                                   element type, input arity) combo into
-//                                   the tile dispatch path.
+// PUBLIC EXTENSION POINTS (cub::transform) -- two independent axes:
+//   tile_eligible<Op, T, NIn>   -- specialize to true_type to opt a (functor
+//                                   type, element type, input arity) combo into
+//                                   the tile dispatch path. Eligibility only.
 //   tile_eligible_v<...>        -- variable-template companion.
+//   tile_operator<Op>           -- the __tile__ functor the tile kernel runs
+//                                   for Op. No default: every tile-eligible Op
+//                                   must specialize it with `using type = <a
+//                                   stateless __tile__ functor mirroring Op>`,
+//                                   because a scalar functor (e.g.
+//                                   cuda::std::plus<__half>) cannot be invoked
+//                                   on ct::tile. Omitting it is a clear
+//                                   static_assert, not a cryptic kernel error.
+//   tile_operator_t<Op>         -- alias for tile_operator<Op>::type.
 //   tile_mufu_heavy<Op>         -- specialize to flag Op as MUFU-heavy; the
 //                                   tile policy picker uses this hint.
 //   tile_mufu_heavy_v<...>      -- variable-template companion.
 //
-// Users call cub::DeviceTransform::Transform with whatever scalar functor
-// they have (e.g. cuda::std::plus<__half>). That scalar functor is NOT
-// directly callable from a tile transform_kernel -- its operator() takes
-// scalars, not ct::tile. So eligible specializations declare a `tile_op_type`
-// member naming a tile-friendly replacement (a stateless functor with a
-// __tile__ templated operator() that performs the same op on ct::tile args).
-// The dispatch hook launches the tile kernel with the replacement, not the
-// user's original functor instance.
+// Eligibility ("may this combo use the tile path?") and substitution ("which
+// __tile__ functor do we actually run?") are separate traits, so an eligible op
+// always registers both: tile_eligible<Op,T,NIn> and tile_operator<Op>.
 //
 // INTERNAL (cub::detail::transform::tile):
 //   tile_plus, tile_multiplies   -- shipped tile-friendly substitutes used by
@@ -41,18 +45,17 @@
 
 #if _CCCL_CUB_HAS_TILE_TRANSFORM()
 
-#  include <cuda_tile.h>
-
 #  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/__functional/operations.h>
 #  include <cuda/std/__type_traits/integral_constant.h>
 
+#  include <cuda_tile.h>
+
 CUB_NAMESPACE_BEGIN
 
 // Public extension surface.
 namespace transform
 {
-
 template <typename Op, typename T, ::cuda::std::size_t NIn>
 struct tile_eligible : ::cuda::std::false_type
 {};
@@ -60,6 +63,21 @@ struct tile_eligible : ::cuda::std::false_type
 template <typename Op, typename T, ::cuda::std::size_t NIn>
 inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
 
+// The __tile__ functor the tile kernel runs for Op -- the tile-side mirror of the scalar Op. There is
+// no default: a scalar functor cannot be invoked on ct::tile, so every tile-eligible Op must specialize
+// this with a `type` naming a stateless __tile__ functor. tile_eligible<Op,...> says a combo MAY use the
+// tile path; tile_operator<Op> says WHAT the tile kernel runs.
+template <typename Op>
+struct tile_operator
+{
+  static_assert(sizeof(Op) == 0,
+                "cub::transform::tile_operator<Op> must be specialized for every tile-eligible Op: "
+                "provide `using type = <stateless __tile__ functor mirroring Op>`.");
+};
+
+template <typename Op>
+using tile_operator_t = typename tile_operator<Op>::type;
+
 // Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq). Setting this makes
 // the tile policy picker cap items/thread so MUFU pipes are not oversaturated.
 template <typename Op>
@@ -68,13 +86,11 @@ struct tile_mufu_heavy : ::cuda::std::false_type
 
 template <typename Op>
 inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
-
 } // namespace transform
 
 // Internal substitutes shipped by CCCL.
 namespace detail::transform::tile
 {
-
 // Tile-friendly mirrors of common cuda::std ops. Each has a __tile__
 // templated operator() so it can be invoked from inside transform_kernel
 // where the arguments are ct::tile<T, ...> rather than scalar T.
@@ -95,36 +111,48 @@ struct tile_multiplies
     return a * b;
   }
 };
-
 } // namespace detail::transform::tile
 
 // Built-in trait specializations live in the public namespace alongside the
 // trait, but reference the internal substitute functors.
 namespace transform
 {
+// cuda::std::plus / multiplies are scalar ops, so each is marked eligible and given a tile_operator mirror.
 #  if _CCCL_HAS_NVFP16()
 template <>
 struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<::cuda::std::plus<::__half>>
 {
-  using tile_op_type = cub::detail::transform::tile::tile_plus;
+  using type = cub::detail::transform::tile::tile_plus;
 };
 template <>
-struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type
+struct tile_operator<::cuda::std::multiplies<::__half>>
 {
-  using tile_op_type = cub::detail::transform::tile::tile_multiplies;
+  using type = cub::detail::transform::tile::tile_multiplies;
 };
 #  endif // _CCCL_HAS_NVFP16()
 
 #  if _CCCL_HAS_NVBF16()
 template <>
 struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<::cuda::std::plus<::__nv_bfloat16>>
 {
-  using tile_op_type = cub::detail::transform::tile::tile_plus;
+  using type = cub::detail::transform::tile::tile_plus;
 };
 template <>
-struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
+struct tile_operator<::cuda::std::multiplies<::__nv_bfloat16>>
 {
-  using tile_op_type = cub::detail::transform::tile::tile_multiplies;
+  using type = cub::detail::transform::tile::tile_multiplies;
 };
 #  endif // _CCCL_HAS_NVBF16()
 } // namespace transform

From dcb838dc7efc7e887ead8a6ef12e76593f4379d5 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 17:03:15 -0700
Subject: [PATCH 56/83] add gated c2h test for the tile transform dispatch path

catch2_test_device_transform_tile.cu, gated by _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED():
in a normal build it is a single placeholder test that passes trivially (SUCCEED); with
--enable-tile and CCCL_ENABLE_TILE_TRANSFORM_DISPATCH it registers a unary (square) and a
binary (add) scalar op with tile_operator substitutes and checks the transform result is
bit-exact with a host std::transform reference across sizes that span the tile path
(n % 16 == 0) and the CUB fallback. Verified the TU compiles under --enable-tile; the
dispatch correctness it checks also matches the standalone harness.

Building it with --enable-tile in-tree needs a scoped/conditional CMake flag (global
--enable-tile breaks the c++17 c2h support lib) -- left as a follow-up.
---
 cub/test/catch2_test_device_transform_tile.cu | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 cub/test/catch2_test_device_transform_tile.cu

diff --git a/cub/test/catch2_test_device_transform_tile.cu b/cub/test/catch2_test_device_transform_tile.cu
new file mode 100644
index 00000000000..3ff05fac915
--- /dev/null
+++ b/cub/test/catch2_test_device_transform_tile.cu
@@ -0,0 +1,145 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_transform.cuh>
+
+#include <c2h/catch2_test_helper.h>
+
+// The tile dispatch path only exists when nvcc is invoked with --enable-tile and the user opts in via
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH. In any other build this file compiles to a single skipped test.
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <algorithm>
+
+#  include <cuda_tile.h>
+
+#  include "catch2_test_launch_helper.h"
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many);
+
+namespace ct = ::cuda::tiles;
+
+// Each scalar op (passed to Transform, used by the CUB fallback) pairs with a tile-side mirror
+// registered through tile_operator. The bodies use tile-tile arithmetic and wrap for unsigned types,
+// so the tile result is bit-exact with the host reference.
+
+// Unary: v * v.
+struct square_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return static_cast<T>(v * v);
+  }
+};
+struct tile_square_op
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return v * v;
+  }
+};
+
+// Binary: a + b.
+struct add_op
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return static_cast<A>(a + b);
+  }
+};
+struct tile_add_op
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+
+CUB_NAMESPACE_BEGIN
+namespace transform
+{
+template <class T>
+struct tile_eligible<square_op, T, 1> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<square_op>
+{
+  using type = tile_square_op;
+};
+
+template <class T>
+struct tile_eligible<add_op, T, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<add_op>
+{
+  using type = tile_add_op;
+};
+} // namespace transform
+CUB_NAMESPACE_END
+
+// Unsigned types so arithmetic wraps deterministically and matches the host reference bit-for-bit.
+using tile_types = c2h::type_list<::cuda::std::uint32_t, ::cuda::std::uint64_t>;
+
+// Sizes span the runtime preconditions: multiples of 16 (with aligned c2h buffers) take the tile
+// kernel; the others fall back to the standard CUB dispatch. Both must produce identical results.
+#  define TILE_TRANSFORM_SIZES GENERATE(::cuda::std::int64_t{0}, 16, 32, 128, 1024, 4096, 65536, 17, 127, 1000)
+
+C2H_TEST("DeviceTransform tile dispatch: unary scalar op routed through its tile_operator substitute",
+         "[device][transform][tile]",
+         tile_types)
+{
+  using type                           = c2h::get<0, TestType>;
+  const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES;
+  CAPTURE(c2h::type_name<type>(), num_items);
+
+  c2h::device_vector<type> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(2), in);
+  c2h::device_vector<type> result(num_items, thrust::no_init);
+
+  transform_many(::cuda::std::make_tuple(in.begin()), result.begin(), num_items, square_op{});
+
+  c2h::host_vector<type> in_h = in;
+  c2h::host_vector<type> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), square_op{});
+  REQUIRE(reference_h == result);
+}
+
+C2H_TEST("DeviceTransform tile dispatch: binary scalar op routed through its tile_operator substitute",
+         "[device][transform][tile]",
+         tile_types)
+{
+  using type                           = c2h::get<0, TestType>;
+  const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES;
+  CAPTURE(c2h::type_name<type>(), num_items);
+
+  c2h::device_vector<type> a(num_items, thrust::no_init);
+  c2h::device_vector<type> b(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(2), a);
+  c2h::gen(C2H_SEED(2), b);
+  c2h::device_vector<type> result(num_items, thrust::no_init);
+
+  transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, add_op{});
+
+  c2h::host_vector<type> a_h = a;
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items, thrust::no_init);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), add_op{});
+  REQUIRE(reference_h == result);
+}
+
+#else // !_CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+C2H_TEST("DeviceTransform tile dispatch requires --enable-tile", "[device][transform][tile]")
+{
+  SUCCEED("tile transform dispatch not enabled in this build");
+}
+
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()

From 14a30fc9f66d8b3e6d5316c8401bd42638c8a1a2 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 17:10:42 -0700
Subject: [PATCH 57/83] use thrust::device_vector in copy bench

Per @fbusato: replace the manual cudaMalloc/cudaFree pair with
thrust::device_vector (RAII, no leak on early return), passing
thrust::raw_pointer_cast(...) to Transform.
---
 cub/benchmarks/bench/transform/tile/copy.cu | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index 6133c69c684..8b7cbb4dedf 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -10,6 +10,8 @@
 
 #include <cub/device/device_transform.cuh>
 
+#include <thrust/device_vector.h>
+
 #include <cuda_runtime.h>
 #include <cuda/std/tuple>
 #include <vector>
@@ -41,18 +43,18 @@ CUB_NAMESPACE_END
 
 template <typename T>
 void copy(nvbench::state& state, nvbench::type_list<T>) {
-    auto n = state.get_int64("Elements{io}");
-    T *in, *out;
-    cudaMalloc(&in, n * sizeof(T)); cudaMalloc(&out, n * sizeof(T));
-    bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize();
+    const auto n = state.get_int64("Elements{io}");
+    thrust::device_vector<T> in(n), out(n);
+    T* in_ptr  = thrust::raw_pointer_cast(in.data());
+    T* out_ptr = thrust::raw_pointer_cast(out.data());
+    bench_init::rand_fill(in_ptr, n, 0xA111); cudaDeviceSynchronize();
     state.add_element_count(n);
     state.add_global_memory_reads<T>(n);
     state.add_global_memory_writes<T>(n);
     state.exec([&](nvbench::launch& launch) {
         cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(in), out, n, identity{}, launch.get_stream());
+            ::cuda::std::make_tuple(in_ptr), out_ptr, n, identity{}, launch.get_stream());
     });
-    cudaFree(in); cudaFree(out);
 }
 
 using types = nvbench::type_list<std::int8_t, std::int16_t, std::int32_t, float, double>;

From 67af2573dcf4c1bc2a491055eb1b82fa06e44557 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 17:24:29 -0700
Subject: [PATCH 58/83] rewrite tile babelstream bench to CUB conventions

Mirror the base transform/babelstream.cu: include ../common.h (nvbench_helper +
bench_transform), thrust::device_vector with constant init, try/catch OOM, and
NVBENCH_BENCH_TYPES with set_type_axes_names + nvbench::range. Named ops keep their
gated tile_operator registrations so --enable-tile routes them to the tile kernel.
Drops the ad-hoc Buffers/cudaMalloc and bench_init.cuh usage. __int128 omitted
(unsupported on the tile path).
---
 .../bench/transform/tile/babelstream.cu       | 314 +++++++++++-------
 1 file changed, 193 insertions(+), 121 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 9ee750ce35b..6e9caf03f2d 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -1,162 +1,234 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// BabelStream-style bandwidth benchmarks via cub::DeviceTransform::Transform.
-// Custom ops self-register their tile substitutes via tile_eligible<>, so the
-// dispatch hook routes them to the tile kernel under --enable-tile + the
-// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro.
+// Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by
+// named, stateless ops that register a tile_operator substitute (gated). Under --enable-tile +
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this
+// is the standard CUB transform path. This file disappears once tile dispatch is fully transparent.
 
-#include <nvbench/nvbench.cuh>
-
-#include <cub/device/device_transform.cuh>
-
-#include <cuda_runtime.h>
-#include <cuda/std/tuple>
-#include <vector>
-#include <cstdint>
+#include "../common.h"
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
-#include "bench_init.cuh"
-
-// User-defined scalar ops (used at the call site, in both build modes).
-struct mul_op {
-    template <class B>
-    __host__ __device__ auto operator()(B b) const { return -(b + b); }
+// Stateless scalar ops, used at the call site in both build modes. Constants are baked in so the ops
+// stay stateless (the tile substitute must be trivially default constructible): with startScalar == -2,
+// `c * scalar` is `-(c + c)`, `b + scalar * c` is `b - c - c`, etc.
+struct mul_op
+{
+  template <class B>
+  __host__ __device__ auto operator()(B b) const
+  {
+    return -(b + b);
+  }
 };
-struct add_op {
-    template <class A, class B>
-    __host__ __device__ auto operator()(A a, B b) const { return a + b; }
+struct add_op
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
 };
-struct triad_op {
-    template <class B, class C>
-    __host__ __device__ auto operator()(B b, C c) const { return b - c - c; }
+struct triad_op
+{
+  template <class B, class C>
+  __host__ __device__ auto operator()(B b, C c) const
+  {
+    return b - c - c;
+  }
 };
-struct nstream_op {
-    template <class A, class B, class C>
-    __host__ __device__ auto operator()(A a, B b, C c) const { return a + b - c - c; }
+struct nstream_op
+{
+  template <class A, class B, class C>
+  __host__ __device__ auto operator()(A a, B b, C c) const
+  {
+    return a + b - c - c;
+  }
 };
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-// Tile-friendly substitutes (must be stateless + trivially default constructible).
-struct tile_mul_op {
-    template <class B>
-    __tile__ auto operator()(B b) const { return -(b + b); }
+struct tile_mul_op
+{
+  template <class B>
+  __tile__ auto operator()(B b) const
+  {
+    return -(b + b);
+  }
 };
-struct tile_add_op {
-    template <class A, class B>
-    __tile__ auto operator()(A a, B b) const { return a + b; }
+struct tile_add_op
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
 };
-struct tile_triad_op {
-    template <class B, class C>
-    __tile__ auto operator()(B b, C c) const { return b - c - c; }
+struct tile_triad_op
+{
+  template <class B, class C>
+  __tile__ auto operator()(B b, C c) const
+  {
+    return b - c - c;
+  }
 };
-struct tile_nstream_op {
-    template <class A, class B, class C>
-    __tile__ auto operator()(A a, B b, C c) const { return a + b - c - c; }
+struct tile_nstream_op
+{
+  template <class A, class B, class C>
+  __tile__ auto operator()(A a, B b, C c) const
+  {
+    return a + b - c - c;
+  }
 };
 
-// Self-register each scalar op for all T (partial specialization on T).
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <class T> struct tile_eligible<mul_op,     T, 1> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<add_op,     T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<triad_op,   T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type {};
-template <> struct tile_operator<mul_op>     { using type = tile_mul_op; };
-template <> struct tile_operator<add_op>     { using type = tile_add_op; };
-template <> struct tile_operator<triad_op>   { using type = tile_triad_op; };
-template <> struct tile_operator<nstream_op> { using type = tile_nstream_op; };
+template <class T>
+struct tile_eligible<mul_op, T, 1> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<add_op, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<triad_op, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<mul_op>
+{
+  using type = tile_mul_op;
+};
+template <>
+struct tile_operator<add_op>
+{
+  using type = tile_add_op;
+};
+template <>
+struct tile_operator<triad_op>
+{
+  using type = tile_triad_op;
+};
+template <>
+struct tile_operator<nstream_op>
+{
+  using type = tile_nstream_op;
+};
 } // namespace transform
 CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+// The tile path does not support __int128 (no tensor_span/partition_view for it), so the type axis
+// omits it relative to the base babelstream bench.
+#ifdef TUNE_T
+using element_types = nvbench::type_list<TUNE_T>;
+#else
+using element_types = nvbench::type_list<std::int8_t, std::int16_t, float, double>;
 #endif
 
-// True if `bytes_needed` worth of GPU memory is available, with 5% headroom
-// for driver overhead. Caller should `state.skip(...)` on false.
-inline bool gpu_mem_available(size_t bytes_needed) {
-    size_t free_b = 0, total_b = 0;
-    if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) return false;
-    return bytes_needed + (bytes_needed / 20) < free_b;
-}
+inline auto array_size_powers = nvbench::range(16, 32, 4);
 
-template <typename T>
-struct Buffers {
-    T *a{}, *b{}, *c{};
-    int64_t n{};
-    Buffers(int64_t n) : n(n) {
-        cudaMalloc(&a, n * sizeof(T));
-        cudaMalloc(&b, n * sizeof(T));
-        cudaMalloc(&c, n * sizeof(T));
-        bench_init::rand_fill(a, n, 0xA111);
-        bench_init::rand_fill(b, n, 0xB222);
-        bench_init::rand_fill(c, n, 0xC333);
-        cudaDeviceSynchronize();
-    }
-    ~Buffers() { cudaFree(a); cudaFree(b); cudaFree(c); }
-};
+// Same constant inputs as the base bench so nstream maintains a consistent workload.
+inline constexpr auto startA      = 11;
+inline constexpr auto startB      = 2;
+inline constexpr auto startC      = 1;
+inline constexpr auto startScalar = -2;
+static_assert(startA == (startA + startB + startScalar * startC), "nstream must have a consistent workload");
 
-// --- benchmarks ---
 template <typename T>
-void mul(nvbench::state& state, nvbench::type_list<T>) {
-    auto n = state.get_int64("Elements{io}");
-    Buffers<T> buf(n);
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(buf.b), buf.c, n, mul_op{}, launch.get_stream());
-    });
+static void mul(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{c.begin()}, b.begin(), n, mul_op{});
 }
-
-template <typename T>
-void add(nvbench::state& state, nvbench::type_list<T>) {
-    auto n = state.get_int64("Elements{io}");
-    Buffers<T> buf(n);
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(2 * n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(buf.a, buf.b), buf.c, n, add_op{}, launch.get_stream());
-    });
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
 }
 
+NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_mul")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
 template <typename T>
-void triad(nvbench::state& state, nvbench::type_list<T>) {
-    auto n = state.get_int64("Elements{io}");
-    Buffers<T> buf(n);
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(2 * n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(buf.b, buf.c), buf.a, n, triad_op{}, launch.get_stream());
-    });
+static void add(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, add_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
 }
 
+NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_add")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
 template <typename T>
-void nstream(nvbench::state& state, nvbench::type_list<T>) {
-    auto n = state.get_int64("Elements{io}");
-    Buffers<T> buf(n);
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(3 * n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(buf.a, buf.b, buf.c), buf.a, n, nstream_op{}, launch.get_stream());
-    });
+static void triad(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, triad_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
 }
 
-using types = nvbench::type_list<std::int8_t, std::int16_t, float, double>;
-inline auto sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};
+NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_triad")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
 
-NVBENCH_BENCH_TYPES(mul,     NVBENCH_TYPE_AXES(types)).set_name("tile_mul").add_int64_power_of_two_axis("Elements{io}", sizes);
-NVBENCH_BENCH_TYPES(add,     NVBENCH_TYPE_AXES(types)).set_name("tile_add").add_int64_power_of_two_axis("Elements{io}", sizes);
-NVBENCH_BENCH_TYPES(triad,   NVBENCH_TYPE_AXES(types)).set_name("tile_triad").add_int64_power_of_two_axis("Elements{io}", sizes);
-NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(types)).set_name("tile_nstream").add_int64_power_of_two_axis("Elements{io}", sizes);
+template <typename T>
+static void nstream(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), n, nstream_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
 
-NVBENCH_MAIN
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_nstream")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);

From 2a1983abfd5c40fa00a5ebdcf5741b0cea59c1aa Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 17:28:45 -0700
Subject: [PATCH 59/83] rewrite remaining tile benches to CUB conventions; drop
 redundant files

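grayscale/copy/pytorch: same treatment as babelstream -- ../common.h
(nvbench_helper + bench_transform), thrust::device_vector + generate(),
try/catch OOM, NVBENCH_BENCH_TYPES with set_type_axes_names + nvbench::range,
gated tile_operator registrations (pytorch keeps tile_mufu_heavy hints).
copy and grayscale move the Elements{io} axis from the explicit
{16, 20, 24, 28, 31} list to nvbench::range(16, 32, 4), and their output
buffers are now allocated with thrust::no_init.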

Deletions:
- fill.cu: the tile path has no fill kernel wired into dispatch (zero-input
  Transform isn't routed), so it only duplicated the base fill bench. fill-on-tile
  remains a follow-up.
- test_device_transform.cu: superseded by cub/test/catch2_test_device_transform_tile.cu.
- bench_init.cuh: replaced by nvbench_helper/common.h.
---
 .../bench/transform/tile/bench_init.cuh       |  67 --
 cub/benchmarks/bench/transform/tile/copy.cu   |  95 +--
 cub/benchmarks/bench/transform/tile/fill.cu   |  34 -
 .../bench/transform/tile/grayscale.cu         | 111 +--
 .../bench/transform/tile/pytorch.cu           | 640 +++++++++++++-----
 .../transform/tile/test_device_transform.cu   | 217 ------
 6 files changed, 589 insertions(+), 575 deletions(-)
 delete mode 100644 cub/benchmarks/bench/transform/tile/bench_init.cuh
 delete mode 100644 cub/benchmarks/bench/transform/tile/fill.cu
 delete mode 100644 cub/benchmarks/bench/transform/tile/test_device_transform.cu

diff --git a/cub/benchmarks/bench/transform/tile/bench_init.cuh b/cub/benchmarks/bench/transform/tile/bench_init.cuh
deleted file mode 100644
index da3e37f8c40..00000000000
--- a/cub/benchmarks/bench/transform/tile/bench_init.cuh
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
-
-#pragma once
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include <cstdint>
-#include <type_traits>
-
-namespace bench_init {
-
-// splitmix64 — fast deterministic PRNG, one mix per element.
-__device__ __forceinline__ uint64_t splitmix64(uint64_t x) {
-    x += 0x9E3779B97F4A7C15ULL;
-    x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
-    x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
-    return x ^ (x >> 31);
-}
-
-// Map a uint64 to a "reasonable" finite value of T in roughly [-1, 1) for floats,
-// or to a non-zero byte for small ints (so neither all-zero nor pathological).
-template <typename T>
-__device__ __forceinline__ T from_random(uint64_t r) {
-    if constexpr (std::is_same_v<T, float>) {
-        // 24-bit mantissa precision, range (-1, 1)
-        uint32_t u = uint32_t(r >> 40);                // 24 bits
-        float f = float(u) * (1.0f / float(1u << 23)) - 1.0f;
-        return f;
-    } else if constexpr (std::is_same_v<T, double>) {
-        uint64_t u = r >> 11;                          // 53 bits
-        double d = double(u) * (1.0 / double(1ull << 52)) - 1.0;
-        return d;
-    } else if constexpr (std::is_same_v<T, __half>) {
-        uint32_t u = uint32_t(r >> 40);
-        float f = float(u) * (1.0f / float(1u << 23)) - 1.0f;
-        return __float2half(f);
-    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
-        uint32_t u = uint32_t(r >> 40);
-        float f = float(u) * (1.0f / float(1u << 23)) - 1.0f;
-        return __float2bfloat16(f);
-    } else {
-        // integer types: small non-zero values, biased away from zero so div is meaningful
-        int v = int(r & 0x7f) + 1;                     // 1..128
-        if (r & 0x100) v = -v;                          // sometimes negative
-        return T(v);
-    }
-}
-
-template <typename T>
-__global__ void rand_fill_kernel(T* __restrict__ p, int64_t n, uint64_t seed) {
-    int64_t stride = int64_t(gridDim.x) * blockDim.x;
-    for (int64_t i = int64_t(blockIdx.x) * blockDim.x + threadIdx.x; i < n; i += stride) {
-        p[i] = from_random<T>(splitmix64(seed ^ uint64_t(i)));
-    }
-}
-
-template <typename T>
-inline void rand_fill(T* p, int64_t n, uint64_t seed = 0xC0FFEE) {
-    int block = 256;
-    int64_t nblocks = (n + block - 1) / block;
-    int grid = int(nblocks < 65535 ? nblocks : 65535);
-    rand_fill_kernel<T><<<grid, block>>>(p, n, seed);
-}
-
-} // namespace bench_init
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index 8b7cbb4dedf..85ed12e0d4d 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -1,67 +1,78 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Pure copy bench (identity transform). Custom identity op self-registers
-// its tile substitute via tile_eligible<>; under --enable-tile + the
-// dispatch macro this routes to the tile load_masked/store_masked path,
-// otherwise it falls through to CUB's standard transform.
+// Pure copy (identity transform) -- measures plain load/store bandwidth through the tile
+// load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under
+// --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes it to the tile kernel,
+// otherwise it falls through to CUB's standard transform. This file disappears once tile dispatch is
+// fully transparent.
 
-#include <nvbench/nvbench.cuh>
-
-#include <cub/device/device_transform.cuh>
-
-#include <thrust/device_vector.h>
-
-#include <cuda_runtime.h>
-#include <cuda/std/tuple>
-#include <vector>
-#include <cstdint>
+#include "../common.h"
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
-#include "bench_init.cuh"
-
-struct identity {
-    template <class T> __host__ __device__ auto operator()(T v) const { return v; }
+struct identity
+{
+  template <class T>
+  __host__ __device__ auto operator()(T v) const
+  {
+    return v;
+  }
 };
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-struct tile_identity {
-    template <class T> __tile__ auto operator()(T v) const { return v; }
+struct tile_identity
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return v;
+  }
 };
 
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <class T> struct tile_eligible<identity, T, 1> : ::cuda::std::true_type {};
-template <> struct tile_operator<identity> { using type = tile_identity; };
+template <class T>
+struct tile_eligible<identity, T, 1> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<identity>
+{
+  using type = tile_identity;
+};
 } // namespace transform
 CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+#ifdef TUNE_T
+using element_types = nvbench::type_list<TUNE_T>;
+#else
+using element_types = nvbench::type_list<std::int8_t, std::int16_t, std::int32_t, float, double>;
 #endif
 
 template <typename T>
-void copy(nvbench::state& state, nvbench::type_list<T>) {
-    const auto n = state.get_int64("Elements{io}");
-    thrust::device_vector<T> in(n), out(n);
-    T* in_ptr  = thrust::raw_pointer_cast(in.data());
-    T* out_ptr = thrust::raw_pointer_cast(out.data());
-    bench_init::rand_fill(in_ptr, n, 0xA111); cudaDeviceSynchronize();
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(in_ptr), out_ptr, n, identity{}, launch.get_stream());
-    });
-}
+static void copy(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
 
-using types = nvbench::type_list<std::int8_t, std::int16_t, std::int32_t, float, double>;
-inline auto sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};
+  thrust::device_vector<T> in = generate(n);
+  thrust::device_vector<T> out(n, thrust::no_init);
 
-NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(types))
-    .set_name("tile_copy")
-    .add_int64_power_of_two_axis("Elements{io}", sizes);
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, identity{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
 
-NVBENCH_MAIN
+NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_copy")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
diff --git a/cub/benchmarks/bench/transform/tile/fill.cu b/cub/benchmarks/bench/transform/tile/fill.cu
deleted file mode 100644
index 5105b25b67b..00000000000
--- a/cub/benchmarks/bench/transform/tile/fill.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
-
-// Fill: zero-input broadcast. Calls cub::DeviceTransform::Fill, which goes
-// through the unified __transform_internal path -- our trait dispatch hook
-// sees the zero-input case but currently has no trait spec for it, so this
-// lands on CUB's standard Fill kernel. Wire a tile substitute later if Fill
-// becomes a bottleneck.
-
-#include <nvbench/nvbench.cuh>
-
-#include <cub/device/device_transform.cuh>
-
-#include <cuda_runtime.h>
-
-template <typename T>
-void fill(nvbench::state& state, nvbench::type_list<T>) {
-    const auto n = state.get_int64("Elements{io}");
-    T* out; cudaMalloc(&out, n * sizeof(T));
-    state.add_element_count(n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Fill(out, n, T(42), launch.get_stream());
-    });
-    cudaFree(out);
-}
-
-// CUB sweeps integral types: int8/16/32/64
-using fill_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;
-
-NVBENCH_BENCH_TYPES(fill, NVBENCH_TYPE_AXES(fill_types)).set_name("tile_fill")
-    .add_int64_power_of_two_axis("Elements{io}", std::vector<nvbench::int64_t>{16, 20, 24, 28, 31});
-
-NVBENCH_MAIN
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index 5ad936019fa..f9ab98d62ad 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -1,76 +1,85 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// Grayscale: RGB pixel -> luminance via three separate input streams.
-// Custom rgb_to_y op self-registers its tile substitute via tile_eligible<>.
+// Tile variant of the grayscale transform bench. Unlike the base bench (a single rgb_t<T> struct
+// input), this uses three separate R/G/B streams so the inputs are plain element types the tile path
+// can vectorize. The named rgb_to_y op registers a tile_operator substitute (gated). This file
+// disappears once tile dispatch is fully transparent.
 
-#include <nvbench/nvbench.cuh>
-
-#include <cub/device/device_transform.cuh>
-
-#include <cuda_runtime.h>
-#include <cuda/std/tuple>
-#include <vector>
+#include "../common.h"
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
-#include "bench_init.cuh"
-
-struct rgb_to_y {
-    template <class R, class G, class B>
-    __host__ __device__ auto operator()(R r, G g, B b) const {
-        constexpr float w_r = 0.2989f;
-        constexpr float w_g = 0.587f;
-        constexpr float w_b = 0.114f;
-        return w_r * r + w_g * g + w_b * b;
-    }
+struct rgb_to_y
+{
+  template <class R, class G, class B>
+  __host__ __device__ auto operator()(R r, G g, B b) const
+  {
+    constexpr float w_r = 0.2989f;
+    constexpr float w_g = 0.587f;
+    constexpr float w_b = 0.114f;
+    return w_r * r + w_g * g + w_b * b;
+  }
 };
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-struct tile_rgb_to_y {
-    template <class R, class G, class B>
-    __tile__ auto operator()(R r, G g, B b) const {
-        constexpr float w_r = 0.2989f;
-        constexpr float w_g = 0.587f;
-        constexpr float w_b = 0.114f;
-        return w_r * r + w_g * g + w_b * b;
-    }
+struct tile_rgb_to_y
+{
+  template <class R, class G, class B>
+  __tile__ auto operator()(R r, G g, B b) const
+  {
+    constexpr float w_r = 0.2989f;
+    constexpr float w_g = 0.587f;
+    constexpr float w_b = 0.114f;
+    return w_r * r + w_g * g + w_b * b;
+  }
 };
 
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
-template <class T> struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type {};
-template <> struct tile_operator<rgb_to_y> { using type = tile_rgb_to_y; };
+template <class T>
+struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<rgb_to_y>
+{
+  using type = tile_rgb_to_y;
+};
 } // namespace transform
 CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+#ifdef TUNE_T
+using value_types = nvbench::type_list<TUNE_T>;
+#else
+using value_types = nvbench::type_list<float, double>;
 #endif
 
 template <typename T>
-void grayscale(nvbench::state& state, nvbench::type_list<T>) {
-    const auto n = state.get_int64("Elements{io}");
-    T *r, *g, *b, *out;
-    cudaMalloc(&r, n*sizeof(T)); cudaMalloc(&g, n*sizeof(T)); cudaMalloc(&b, n*sizeof(T));
-    cudaMalloc(&out, n*sizeof(T));
-    bench_init::rand_fill(r, n, 0xA111);
-    bench_init::rand_fill(g, n, 0xA222);
-    bench_init::rand_fill(b, n, 0xA333);
-
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(3 * n);   // matches CUB's rgb_t<T> = 3*sizeof(T)
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(r, g, b), out, n, rgb_to_y{}, launch.get_stream());
-    });
-    cudaFree(r); cudaFree(g); cudaFree(b); cudaFree(out);
-}
+static void grayscale(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
 
-using value_types = nvbench::type_list<float, double>;
+  thrust::device_vector<T> r = generate(n);
+  thrust::device_vector<T> g = generate(n);
+  thrust::device_vector<T> b = generate(n);
+  thrust::device_vector<T> out(n, thrust::no_init);
 
-NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types)).set_name("tile_grayscale")
-    .add_int64_power_of_two_axis("Elements{io}", std::vector<nvbench::int64_t>{16, 20, 24, 28, 31});
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n); // matches the base bench's rgb_t<T> = 3 * sizeof(T)
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{r.begin(), g.begin(), b.begin()}, out.begin(), n, rgb_to_y{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
 
-NVBENCH_MAIN
+NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types))
+  .set_name("tile_grayscale")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 25e90b7f66e..006f88ff5c6 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -1,200 +1,512 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// PyTorch-style ops via cub::DeviceTransform::Transform. Each custom op
-// self-registers a tile substitute through tile_eligible<>, so the dispatch
-// hook routes them to the tile kernel under --enable-tile + the
-// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH macro. MUFU-heavy ops also opt into
-// tile_mufu_heavy<> so the tile policy picker caps items/thread at the
-// vector width on sub-4-byte types.
+// Tile variant of the PyTorch-style transform benches. Each named op registers a tile_operator
+// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy<> so the tile policy picker caps
+// items/thread at the vector width on sub-4-byte types. Under --enable-tile +
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this
+// is the standard CUB path. This file disappears once tile dispatch is fully transparent.
 
-#include <nvbench/nvbench.cuh>
-
-#include <cub/device/device_transform.cuh>
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
 #include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
 #include <cuda/std/cmath>
-#include <cuda/std/tuple>
-#include <vector>
+
+#include "../common.h"
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 #  include <cuda_tile.h>
 #endif
 
-#include "bench_init.cuh"
-
-// ========================================================================
-// Scalar ops (the types the user passes to cub::DeviceTransform::Transform).
-// Sub-4-byte input types compute in float and cast back, matching the tile
-// substitute below.
-// ========================================================================
-template <class T> __host__ __device__ float to_f(T v) { return static_cast<float>(v); }
-template <class T> __host__ __device__ T from_f(float f) { return static_cast<T>(f); }
-
-struct relu_op    { template <class T> __host__ __device__ T operator()(T v) const {
-    float f = to_f(v); return from_f<T>(f > 0.0f ? f : 0.0f); } };
-struct sigmoid_op { template <class T> __host__ __device__ T operator()(T v) const {
-    float f = to_f(v); return from_f<T>(1.0f / (1.0f + ::cuda::std::exp(-f))); } };
-struct tanh_op    { template <class T> __host__ __device__ T operator()(T v) const {
-    return from_f<T>(::cuda::std::tanh(to_f(v))); } };
-struct gelu_op    { template <class T> __host__ __device__ T operator()(T v) const {
+// Scalar ops the user passes to Transform. Sub-4-byte input types compute in float and cast back,
+// matching the tile substitutes below.
+template <class T>
+__host__ __device__ float to_f(T v)
+{
+  return static_cast<float>(v);
+}
+template <class T>
+__host__ __device__ T from_f(float f)
+{
+  return static_cast<T>(f);
+}
+
+struct relu_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    float f = to_f(v);
+    return from_f<T>(f > 0.0f ? f : 0.0f);
+  }
+};
+struct sigmoid_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    float f = to_f(v);
+    return from_f<T>(1.0f / (1.0f + ::cuda::std::exp(-f)));
+  }
+};
+struct tanh_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return from_f<T>(::cuda::std::tanh(to_f(v)));
+  }
+};
+struct gelu_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
     constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;
     float f = to_f(v);
-    return from_f<T>(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f)))); } };
-struct sin_op     { template <class T> __host__ __device__ T operator()(T v) const {
-    return from_f<T>(::cuda::std::sin(to_f(v))); } };
-struct exp_op     { template <class T> __host__ __device__ T operator()(T v) const {
-    return from_f<T>(::cuda::std::exp(to_f(v))); } };
-
-struct binary_add  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a + b; } };
-struct binary_sub  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a - b; } };
-struct binary_mul  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a * b; } };
-struct binary_div  { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a / b; } };
-struct binary_le   { template <class A, class B> __host__ __device__ A operator()(A a, B b) const { return static_cast<A>(a <= b); } };
-struct binary_ge   { template <class A, class B> __host__ __device__ A operator()(A a, B b) const { return static_cast<A>(a >= b); } };
-struct binary_fmin { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a < b ? a : b; } };
-struct binary_fmax { template <class A, class B> __host__ __device__ auto operator()(A a, B b) const { return a > b ? a : b; } };
-
-// ========================================================================
-// Tile substitutes + trait registration. Only compiled under tile mode.
-// ========================================================================
+    return from_f<T>(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f))));
+  }
+};
+struct sin_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return from_f<T>(::cuda::std::sin(to_f(v)));
+  }
+};
+struct exp_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return from_f<T>(::cuda::std::exp(to_f(v)));
+  }
+};
+
+struct binary_add
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+struct binary_sub
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a - b;
+  }
+};
+struct binary_mul
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a * b;
+  }
+};
+struct binary_div
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a / b;
+  }
+};
+struct binary_le
+{
+  template <class A, class B>
+  __host__ __device__ A operator()(A a, B b) const
+  {
+    return static_cast<A>(a <= b);
+  }
+};
+struct binary_ge
+{
+  template <class A, class B>
+  __host__ __device__ A operator()(A a, B b) const
+  {
+    return static_cast<A>(a >= b);
+  }
+};
+struct binary_fmin
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a < b ? a : b;
+  }
+};
+struct binary_fmax
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a > b ? a : b;
+  }
+};
+
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 namespace ct = ::cuda::tiles;
 
-template <class T> __tile__ auto as_float(T v) { return ct::element_cast<float>(v); }
-template <class T, class F> __tile__ auto from_float(F f) { return ct::element_cast<ct::tile_element_t<T>>(f); }
+template <class T>
+__tile__ auto as_float(T v)
+{
+  return ct::element_cast<float>(v);
+}
+template <class T, class F>
+__tile__ auto from_float(F f)
+{
+  return ct::element_cast<ct::tile_element_t<T>>(f);
+}
 
-struct tile_relu    { template <class T> __tile__ auto operator()(T v) const {
-    auto f = as_float(v); return from_float<T>(ct::select(f > 0.0f, f, f - f)); } };
-struct tile_sigmoid { template <class T> __tile__ auto operator()(T v) const {
-    auto f = as_float(v); return from_float<T>(1.0f / (1.0f + ct::exp(-f))); } };
-struct tile_tanh    { template <class T> __tile__ auto operator()(T v) const {
-    return from_float<T>(ct::tanh(as_float(v))); } };
-struct tile_gelu    { template <class T> __tile__ auto operator()(T v) const {
+struct tile_relu
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    auto f = as_float(v);
+    return from_float<T>(ct::select(f > 0.0f, f, f - f));
+  }
+};
+struct tile_sigmoid
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    auto f = as_float(v);
+    return from_float<T>(1.0f / (1.0f + ct::exp(-f)));
+  }
+};
+struct tile_tanh
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return from_float<T>(ct::tanh(as_float(v)));
+  }
+};
+struct tile_gelu
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
     constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;
     auto f = as_float(v);
-    return from_float<T>(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f)))); } };
-struct tile_sin     { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::sin(as_float(v))); } };
-struct tile_exp     { template <class T> __tile__ auto operator()(T v) const { return from_float<T>(ct::exp(as_float(v))); } };
-
-struct tile_binary_add  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a + b; } };
-struct tile_binary_sub  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a - b; } };
-struct tile_binary_mul  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a * b; } };
-struct tile_binary_div  { template <class A, class B> __tile__ auto operator()(A a, B b) const { return a / b; } };
-struct tile_binary_le   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a <= b); } };
-struct tile_binary_ge   { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::element_cast<ct::tile_element_t<A>>(a >= b); } };
-struct tile_binary_fmin { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a < b, a, b); } };
-struct tile_binary_fmax { template <class A, class B> __tile__ auto operator()(A a, B b) const { return ct::select(a > b, a, b); } };
+    return from_float<T>(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f))));
+  }
+};
+struct tile_sin
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return from_float<T>(ct::sin(as_float(v)));
+  }
+};
+struct tile_exp
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return from_float<T>(ct::exp(as_float(v)));
+  }
+};
+
+struct tile_binary_add
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+struct tile_binary_sub
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a - b;
+  }
+};
+struct tile_binary_mul
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a * b;
+  }
+};
+struct tile_binary_div
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a / b;
+  }
+};
+struct tile_binary_le
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::element_cast<ct::tile_element_t<A>>(a <= b);
+  }
+};
+struct tile_binary_ge
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::element_cast<ct::tile_element_t<A>>(a >= b);
+  }
+};
+struct tile_binary_fmin
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::select(a < b, a, b);
+  }
+};
+struct tile_binary_fmax
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::select(a > b, a, b);
+  }
+};
 
 CUB_NAMESPACE_BEGIN
 namespace transform
 {
 // Unary
-template <class T> struct tile_eligible<relu_op,    T, 1> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<sigmoid_op, T, 1> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<tanh_op,    T, 1> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<gelu_op,    T, 1> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<sin_op,     T, 1> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<exp_op,     T, 1> : ::cuda::std::true_type {};
-template <> struct tile_operator<relu_op>    { using type = tile_relu;    };
-template <> struct tile_operator<sigmoid_op> { using type = tile_sigmoid; };
-template <> struct tile_operator<tanh_op>    { using type = tile_tanh;    };
-template <> struct tile_operator<gelu_op>    { using type = tile_gelu;    };
-template <> struct tile_operator<sin_op>     { using type = tile_sin;     };
-template <> struct tile_operator<exp_op>     { using type = tile_exp;     };
-
-// MUFU-heavy unary ops: hint to tile policy picker to cap items/thread at vector width on sub-4-byte types.
-template <> struct tile_mufu_heavy<sigmoid_op> : ::cuda::std::true_type {};
-template <> struct tile_mufu_heavy<tanh_op>    : ::cuda::std::true_type {};
-template <> struct tile_mufu_heavy<gelu_op>    : ::cuda::std::true_type {};
-template <> struct tile_mufu_heavy<sin_op>     : ::cuda::std::true_type {};
-template <> struct tile_mufu_heavy<exp_op>     : ::cuda::std::true_type {};
+template <class T>
+struct tile_eligible<relu_op, T, 1> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<sigmoid_op, T, 1> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<tanh_op, T, 1> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<gelu_op, T, 1> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<sin_op, T, 1> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<exp_op, T, 1> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<relu_op>
+{
+  using type = tile_relu;
+};
+template <>
+struct tile_operator<sigmoid_op>
+{
+  using type = tile_sigmoid;
+};
+template <>
+struct tile_operator<tanh_op>
+{
+  using type = tile_tanh;
+};
+template <>
+struct tile_operator<gelu_op>
+{
+  using type = tile_gelu;
+};
+template <>
+struct tile_operator<sin_op>
+{
+  using type = tile_sin;
+};
+template <>
+struct tile_operator<exp_op>
+{
+  using type = tile_exp;
+};
+
+// MUFU-heavy unary ops: hint the tile policy picker to cap items/thread at the vector width on
+// sub-4-byte types.
+template <>
+struct tile_mufu_heavy<sigmoid_op> : ::cuda::std::true_type
+{};
+template <>
+struct tile_mufu_heavy<tanh_op> : ::cuda::std::true_type
+{};
+template <>
+struct tile_mufu_heavy<gelu_op> : ::cuda::std::true_type
+{};
+template <>
+struct tile_mufu_heavy<sin_op> : ::cuda::std::true_type
+{};
+template <>
+struct tile_mufu_heavy<exp_op> : ::cuda::std::true_type
+{};
 
 // Binary
-template <class T> struct tile_eligible<binary_add,  T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_sub,  T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_mul,  T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_div,  T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_le,   T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_ge,   T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type {};
-template <class T> struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type {};
-template <> struct tile_operator<binary_add>  { using type = tile_binary_add;  };
-template <> struct tile_operator<binary_sub>  { using type = tile_binary_sub;  };
-template <> struct tile_operator<binary_mul>  { using type = tile_binary_mul;  };
-template <> struct tile_operator<binary_div>  { using type = tile_binary_div;  };
-template <> struct tile_operator<binary_le>   { using type = tile_binary_le;   };
-template <> struct tile_operator<binary_ge>   { using type = tile_binary_ge;   };
-template <> struct tile_operator<binary_fmin> { using type = tile_binary_fmin; };
-template <> struct tile_operator<binary_fmax> { using type = tile_binary_fmax; };
+template <class T>
+struct tile_eligible<binary_add, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_sub, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_mul, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_div, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_le, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_ge, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type
+{};
+template <class T>
+struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type
+{};
+template <>
+struct tile_operator<binary_add>
+{
+  using type = tile_binary_add;
+};
+template <>
+struct tile_operator<binary_sub>
+{
+  using type = tile_binary_sub;
+};
+template <>
+struct tile_operator<binary_mul>
+{
+  using type = tile_binary_mul;
+};
+template <>
+struct tile_operator<binary_div>
+{
+  using type = tile_binary_div;
+};
+template <>
+struct tile_operator<binary_le>
+{
+  using type = tile_binary_le;
+};
+template <>
+struct tile_operator<binary_ge>
+{
+  using type = tile_binary_ge;
+};
+template <>
+struct tile_operator<binary_fmin>
+{
+  using type = tile_binary_fmin;
+};
+template <>
+struct tile_operator<binary_fmax>
+{
+  using type = tile_binary_fmax;
+};
 } // namespace transform
 CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+#ifdef TUNE_T
+using element_types = nvbench::type_list<TUNE_T>;
+#else
+using element_types = nvbench::type_list<
+#  if _CCCL_HAS_NVFP16()
+  __half,
+#  endif
+#  if _CCCL_HAS_NVBF16()
+  __nv_bfloat16,
+#  endif
+  float>;
 #endif
 
-// ========================================================================
-// Bench harness.
-// ========================================================================
 template <typename Op, typename T>
-void run_unary(nvbench::state& state) {
-    const auto n = state.get_int64("Elements{io}");
-    T *in, *out;
-    cudaMalloc(&in, n * sizeof(T)); cudaMalloc(&out, n * sizeof(T));
-    bench_init::rand_fill(in, n, 0xA111); cudaDeviceSynchronize();
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(in), out, n, Op{}, launch.get_stream());
-    });
-    cudaFree(in); cudaFree(out);
+static void run_unary(nvbench::state& state)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> in(n, T(1));
+  thrust::device_vector<T> out(n, thrust::no_init);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, Op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
 }
 
 template <typename Op, typename T>
-void run_binary(nvbench::state& state) {
-    const auto n = state.get_int64("Elements{io}");
-    T *a, *b, *out;
-    cudaMalloc(&a, n*sizeof(T)); cudaMalloc(&b, n*sizeof(T)); cudaMalloc(&out, n*sizeof(T));
-    bench_init::rand_fill(a, n, 0xA111);
-    bench_init::rand_fill(b, n, 0xB222);
-    cudaDeviceSynchronize();
-    state.add_element_count(n);
-    state.add_global_memory_reads<T>(2*n);
-    state.add_global_memory_writes<T>(n);
-    state.exec([&](nvbench::launch& launch) {
-        cub::DeviceTransform::Transform(
-            ::cuda::std::make_tuple(a, b), out, n, Op{}, launch.get_stream());
-    });
-    cudaFree(a); cudaFree(b); cudaFree(out);
+static void run_binary(nvbench::state& state)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, T(1));
+  thrust::device_vector<T> b(n, T(1));
+  thrust::device_vector<T> out(n, thrust::no_init);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, out.begin(), n, Op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
 }
 
-using element_types = nvbench::type_list<__half, __nv_bfloat16, float>;
-inline auto pt_sizes = std::vector<nvbench::int64_t>{16, 20, 24, 28, 31};
-
-#define UNARY_BENCH(name, op) \
-    template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_unary<op, T>(state); } \
-    NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes);
-
-UNARY_BENCH(relu,    relu_op)
-UNARY_BENCH(sigmoid, sigmoid_op)
-UNARY_BENCH(tanh,    tanh_op)
-UNARY_BENCH(gelu,    gelu_op)
-UNARY_BENCH(sin,     sin_op)
-UNARY_BENCH(exp,     exp_op)
-
-#define BINARY_BENCH(name, op) \
-    template <typename T> void name##_bench(nvbench::state& state, nvbench::type_list<T>) { run_binary<op, T>(state); } \
-    NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types)).set_name("tile_pt_" #name).add_int64_power_of_two_axis("Elements{io}", pt_sizes);
-
-BINARY_BENCH(add,  binary_add)
-BINARY_BENCH(sub,  binary_sub)
-BINARY_BENCH(mul,  binary_mul)
-BINARY_BENCH(div,  binary_div)
-BINARY_BENCH(le,   binary_le)
-BINARY_BENCH(ge,   binary_ge)
-BINARY_BENCH(fmin, binary_fmin)
-BINARY_BENCH(fmax, binary_fmax)
-
-NVBENCH_MAIN
+inline auto pt_sizes = nvbench::range(16, 32, 4);
+
+#define UNARY_BENCH(name, op)                                            \
+  template <typename T>                                                  \
+  static void name##_bench(nvbench::state& state, nvbench::type_list<T>) \
+  {                                                                      \
+    run_unary<op, T>(state);                                             \
+  }                                                                      \
+  NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types))    \
+    .set_name("tile_" #name)                                             \
+    .set_type_axes_names({"T{ct}"})                                      \
+    .add_int64_power_of_two_axis("Elements{io}", pt_sizes)
+
+UNARY_BENCH(relu, relu_op);
+UNARY_BENCH(sigmoid, sigmoid_op);
+UNARY_BENCH(tanh, tanh_op);
+UNARY_BENCH(gelu, gelu_op);
+UNARY_BENCH(sin, sin_op);
+UNARY_BENCH(exp, exp_op);
+
+#define BINARY_BENCH(name, op)                                           \
+  template <typename T>                                                  \
+  static void name##_bench(nvbench::state& state, nvbench::type_list<T>) \
+  {                                                                      \
+    run_binary<op, T>(state);                                            \
+  }                                                                      \
+  NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types))    \
+    .set_name("tile_pt_" #name)                                          \
+    .set_type_axes_names({"T{ct}"})                                      \
+    .add_int64_power_of_two_axis("Elements{io}", pt_sizes)
+
+BINARY_BENCH(add, binary_add);
+BINARY_BENCH(sub, binary_sub);
+BINARY_BENCH(mul, binary_mul);
+BINARY_BENCH(div, binary_div);
+BINARY_BENCH(le, binary_le);
+BINARY_BENCH(ge, binary_ge);
+BINARY_BENCH(fmin, binary_fmin);
+BINARY_BENCH(fmax, binary_fmax);
diff --git a/cub/benchmarks/bench/transform/tile/test_device_transform.cu b/cub/benchmarks/bench/transform/tile/test_device_transform.cu
deleted file mode 100644
index e2c7a5006bb..00000000000
--- a/cub/benchmarks/bench/transform/tile/test_device_transform.cu
+++ /dev/null
@@ -1,217 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
-
-// Standalone correctness tests for cub::DeviceTransform with the tile
-// dispatch hook on. Exercises:
-//   - Built-in trait specs (cuda::std::plus, cuda::std::multiplies)
-//   - User-registered trait specs (square_op, identity_op)
-//   - cub::DeviceTransform::Fill (zero-input case)
-//
-// Built under --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH so the
-// hook routes eligible combos to the tile kernel. Sits next to the benches
-// so it builds against the same tileiras toolchain; not part of CCCL's
-// catch2 suite.
-
-#include <cub/device/device_transform.cuh>
-
-#include <cuda_runtime.h>
-
-#include <cuda/std/functional>
-#include <cuda/std/tuple>
-
-#include <cstdio>
-#include <cstdint>
-#include <cstdlib>
-#include <cmath>
-#include <vector>
-
-#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-#  include <cuda_tile.h>
-#endif
-
-namespace {
-
-int g_failures = 0;
-
-#define CUDA_CHECK(expr)                                                                  \
-    do {                                                                                  \
-        cudaError_t _e = (expr);                                                          \
-        if (_e != cudaSuccess) {                                                          \
-            std::fprintf(stderr, "%s:%d  CUDA error: %s\n", __FILE__, __LINE__,           \
-                         cudaGetErrorString(_e));                                         \
-            std::exit(2);                                                                 \
-        }                                                                                 \
-    } while (0)
-
-template <typename T>
-bool eq(T a, T b) { return a == b; }
-inline bool eq(float a, float b) {
-    float diff = std::fabs(a - b);
-    float tol  = 1e-5f * std::fmax(std::fabs(a), std::fabs(b));
-    return diff <= std::fmax(tol, 1e-6f);
-}
-
-template <typename T>
-void expect_array(const char* name, const std::vector<T>& got, const std::vector<T>& want) {
-    if (got.size() != want.size()) {
-        std::fprintf(stderr, "[FAIL] %s: size %zu != %zu\n", name, got.size(), want.size());
-        ++g_failures;
-        return;
-    }
-    int mismatches = 0;
-    for (size_t i = 0; i < got.size(); ++i) {
-        if (!eq(got[i], want[i])) {
-            if (mismatches < 4) {
-                std::fprintf(stderr, "[FAIL] %s: idx=%zu got=%g want=%g\n",
-                             name, i, double(got[i]), double(want[i]));
-            }
-            ++mismatches;
-        }
-    }
-    if (mismatches) { ++g_failures; std::fprintf(stderr, "[FAIL] %s: %d mismatches\n", name, mismatches); }
-    else            { std::printf("[ OK ] %s (n=%zu)\n", name, got.size()); }
-}
-
-// User-defined scalar functors (the call-site type). identity_op and square_op
-// don't have a cuda::std equivalent, so we self-register them. add and mul map
-// to cuda::std::plus / cuda::std::multiplies which CCCL already ships specs for.
-
-struct identity_op {
-    template <class T> __host__ __device__ T operator()(T a) const { return a; }
-};
-struct square_op {
-    template <class T> __host__ __device__ T operator()(T a) const { return a * a; }
-};
-
-#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-namespace ct = ::cuda::tiles;
-
-// Tile-friendly substitutes (must be stateless + trivially default constructible).
-struct tile_identity_op {
-    template <class T> __tile__ auto operator()(T v) const { return v; }
-};
-struct tile_square_op {
-    template <class T> __tile__ auto operator()(T v) const { return v * v; }
-};
-#endif
-
-template <typename T>
-std::vector<T> ramp(int64_t n, T start = T{0}, T step = T{1}) {
-    std::vector<T> v(n);
-    for (int64_t i = 0; i < n; ++i) v[i] = T(start + step * T(i));
-    return v;
-}
-
-template <typename T>
-struct GpuVec {
-    T* d{};
-    int64_t n{};
-    explicit GpuVec(int64_t n) : n(n) { CUDA_CHECK(cudaMalloc(&d, n * sizeof(T))); }
-    explicit GpuVec(const std::vector<T>& h) : GpuVec(int64_t(h.size())) {
-        CUDA_CHECK(cudaMemcpy(d, h.data(), n * sizeof(T), cudaMemcpyHostToDevice));
-    }
-    ~GpuVec() { if (d) cudaFree(d); }
-    std::vector<T> to_host() const {
-        std::vector<T> h(n);
-        CUDA_CHECK(cudaMemcpy(h.data(), d, n * sizeof(T), cudaMemcpyDeviceToHost));
-        return h;
-    }
-};
-
-template <typename T>
-void test_identity(int64_t n) {
-    auto h_in = ramp<T>(n, T{1}, T{1});
-    GpuVec<T> dx(h_in), dy(n);
-    CUDA_CHECK(cub::DeviceTransform::Transform(
-        ::cuda::std::make_tuple(dx.d), dy.d, n, identity_op{}));
-    CUDA_CHECK(cudaDeviceSynchronize());
-    expect_array("identity", dy.to_host(), h_in);
-}
-
-template <typename T>
-void test_square(int64_t n) {
-    auto h_in = ramp<T>(n, T{1}, T{1});
-    std::vector<T> want(n);
-    for (int64_t i = 0; i < n; ++i) want[i] = h_in[i] * h_in[i];
-    GpuVec<T> dx(h_in), dy(n);
-    CUDA_CHECK(cub::DeviceTransform::Transform(
-        ::cuda::std::make_tuple(dx.d), dy.d, n, square_op{}));
-    CUDA_CHECK(cudaDeviceSynchronize());
-    expect_array("square", dy.to_host(), want);
-}
-
-template <typename T>
-void test_add(int64_t n) {
-    auto ha = ramp<T>(n, T{1},   T{1});
-    auto hb = ramp<T>(n, T{100}, T{2});
-    std::vector<T> want(n);
-    for (int64_t i = 0; i < n; ++i) want[i] = ha[i] + hb[i];
-    GpuVec<T> da(ha), db(hb), dc(n);
-    CUDA_CHECK(cub::DeviceTransform::Transform(
-        ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::plus<T>{}));
-    CUDA_CHECK(cudaDeviceSynchronize());
-    expect_array("add", dc.to_host(), want);
-}
-
-template <typename T>
-void test_mul(int64_t n) {
-    auto ha = ramp<T>(n, T{1}, T{1});
-    auto hb = ramp<T>(n, T{3}, T{1});
-    std::vector<T> want(n);
-    for (int64_t i = 0; i < n; ++i) want[i] = ha[i] * hb[i];
-    GpuVec<T> da(ha), db(hb), dc(n);
-    CUDA_CHECK(cub::DeviceTransform::Transform(
-        ::cuda::std::make_tuple(da.d, db.d), dc.d, n, ::cuda::std::multiplies<T>{}));
-    CUDA_CHECK(cudaDeviceSynchronize());
-    expect_array("mul", dc.to_host(), want);
-}
-
-template <typename T>
-void test_fill(int64_t n, T value) {
-    GpuVec<T> dy(n);
-    CUDA_CHECK(cub::DeviceTransform::Fill(dy.d, n, value));
-    CUDA_CHECK(cudaDeviceSynchronize());
-    std::vector<T> want(n, value);
-    expect_array("fill", dy.to_host(), want);
-}
-
-} // namespace
-
-// User self-registers identity_op and square_op as tile-eligible.
-#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-CUB_NAMESPACE_BEGIN
-namespace transform
-{
-template <> struct tile_eligible<identity_op, int32_t, 1> : ::cuda::std::true_type {};
-template <> struct tile_eligible<identity_op, float, 1>   : ::cuda::std::true_type {};
-template <> struct tile_eligible<square_op, int32_t, 1>   : ::cuda::std::true_type {};
-template <> struct tile_eligible<square_op, float, 1>     : ::cuda::std::true_type {};
-template <> struct tile_operator<identity_op> { using type = tile_identity_op; };
-template <> struct tile_operator<square_op>   { using type = tile_square_op; };
-} // namespace transform
-CUB_NAMESPACE_END
-#endif
-
-int main() {
-    // pow-2, multiple tiles
-    test_identity<std::int32_t>(4096);
-    test_square<std::int32_t>(2048);
-    test_add<float>(4096);
-    test_mul<float>(2048);
-    test_fill<std::int32_t>(1024, 42);
-
-    // non-pow-2 num_items (still multiple of 16 to satisfy assume_divisible<16>)
-    test_add<float>(4112);     // 16 * 257
-    test_fill<std::int32_t>(1008, -7);   // 16 * 63
-
-    // single full tile and below-one-tile (still >=16, div by 16)
-    test_square<std::int32_t>(16);
-    test_add<float>(64);
-
-    if (g_failures) {
-        std::fprintf(stderr, "\n%d test group(s) FAILED\n", g_failures);
-        return 1;
-    }
-    std::printf("\nall tests passed\n");
-    return 0;
-}

From 01418396cec19a620a7bee9a1974deceef09562b Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 18:51:10 -0700
Subject: [PATCH 60/83] accept cuda::aligned_size_t<16> as a compile-time
 tile-commit hint

DeviceTransform::Transform now accepts num_items as cuda::aligned_size_t<N> --
the same opt-in promise cuda::memcpy_async uses (pointers N-aligned, num_items a
multiple of N). When N>=16 and the op is tile-eligible, the hook commits to the
tile path at compile time and skips runtime_preconditions_valid; otherwise it
falls back to the existing runtime alignment/divisibility check.

No overload is needed: NumItemsT is deduced as the caller's type, and
__get_size_align_v<NumItemsT> reads the alignment from it. num_items is
unwrapped to a plain integer for the offset machinery (choose_signed_offset
requires an integral type). For plain-integer callers the unwrap is a no-op
(count_t == NumItemsT, count == num_items), so they are byte-for-byte unchanged.
Verified on sm_120 that the aligned_size_t commit path is bit-correct.
---
 cub/cub/device/device_transform.cuh | 42 ++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 7d8bd316e81..24a828c5fc0 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -27,6 +27,7 @@
 #include <cuda/__functional/always_true_false.h>
 #include <cuda/__functional/call_or.h>
 #include <cuda/__iterator/zip_iterator.h>
+#include <cuda/__memory/aligned_size.h>
 #include <cuda/__stream/get_stream.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/tuple>
@@ -97,7 +98,16 @@ struct DeviceTransform
     // https://github.com/NVIDIA/cccl/issues/8805 for data. We use choose_signed_offset to just check if it can hold the
     // value passed by the user, but otherwise ignore the chosen signed offset type.
     using offset_t = ::cuda::std::int64_t;
-    if (const cudaError_t error = detail::choose_signed_offset<NumItemsT>::is_exceeding_offset_type(num_items))
+
+    // num_items may be a plain integer or a cuda::aligned_size_t<N> -- an opt-in promise (the same one
+    // cuda::memcpy_async uses) that the pointers are N-aligned and num_items is a multiple of N. Unwrap
+    // it to a plain integer for the offset machinery (choose_signed_offset requires an integral type);
+    // the alignment promise is read separately by the tile hook below. For a plain integer this is a
+    // no-op: count_t == NumItemsT and count == num_items.
+    constexpr ::cuda::std::size_t num_items_align = ::cuda::__get_size_align_v<NumItemsT>;
+    using count_t       = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>;
+    const count_t count = static_cast<count_t>(num_items);
+    if (const cudaError_t error = detail::choose_signed_offset<count_t>::is_exceeding_offset_type(count))
     {
       return error;
     }
@@ -105,24 +115,24 @@ struct DeviceTransform
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible AND
-    // the runtime alignment + divisibility preconditions hold, route to the
-    // tile kernel. Otherwise fall through to the standard CUB dispatch
-    // below -- CUB's existing kernels handle the unaligned tail case via
-    // their own internal logic, so misalignment is a graceful fallback,
-    // not an error.
+    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible we route to the tile kernel:
+    //  - if num_items is a cuda::aligned_size_t<N>=16, the caller has promised 16-byte pointer
+    //    alignment + divisibility, so we commit to tile at compile time and skip the runtime check;
+    //  - otherwise we check the alignment/divisibility preconditions at runtime and fall through to
+    //    the standard CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail
+    //    case, so this is a graceful fallback, not an error).
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
-                  && cub::detail::transform::tile::tile_dispatch_eligible_v<
-                       TransformOp,
-                       RandomAccessIteratorOut,
-                       RandomAccessIteratorsIn...>)
+                  && cub::detail::transform::tile::
+                    tile_dispatch_eligible_v<TransformOp, RandomAccessIteratorOut, RandomAccessIteratorsIn...>)
     {
-      if (cub::detail::transform::tile::runtime_preconditions_valid(
-            inputs, output, static_cast<offset_t>(num_items)))
+      if constexpr (num_items_align >= 16)
+      {
+        return cub::detail::transform::tile::dispatch<TransformOp>(inputs, output, static_cast<offset_t>(count), stream);
+      }
+      else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(count)))
       {
-        return cub::detail::transform::tile::dispatch<TransformOp>(
-          inputs, output, static_cast<offset_t>(num_items), stream);
+        return cub::detail::transform::tile::dispatch<TransformOp>(inputs, output, static_cast<offset_t>(count), stream);
       }
     }
 #endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
@@ -145,7 +155,7 @@ struct DeviceTransform
     return detail::transform::dispatch<StableAddress>(
       ::cuda::std::move(inputs),
       ::cuda::std::move(output),
-      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(count),
       ::cuda::std::move(predicate),
       ::cuda::std::move(transform_op),
       stream,

From 99042d3933723f1c231ec5b80ca85d4a92793cd0 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 19:16:55 -0700
Subject: [PATCH 61/83] clang-format kernel_transform_tile.cuh and
 tuning_transform_tile.cuh

---
 .../dispatch/kernels/kernel_transform_tile.cuh   | 16 ++++++++--------
 .../dispatch/tuning/tuning_transform_tile.cuh    |  2 --
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 585cefc833d..778721f6257 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -17,15 +17,14 @@
 
 #if _CCCL_CUB_HAS_TILE_TRANSFORM()
 
-#  include <cuda_tile.h>
-
 #  include <cuda/std/cstdint>
 
+#  include <cuda_tile.h>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::transform::tile
 {
-
 // Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing:
 //   assume_aligned<16>      -- promises the pointer is 16-byte aligned, so the compiler can pick LDG.E.128 vectorized
 //                              loads/stores.
@@ -36,7 +35,7 @@ namespace detail::transform::tile
 template <int TileSize, typename T, typename N>
 [[nodiscard]] __tile__ auto make_aligned_partition_view(T* ptr, N n)
 {
-  namespace ct        = ::cuda::tiles;
+  namespace ct         = ::cuda::tiles;
   const auto ptr_align = ct::assume_aligned<16>(ptr);
   auto span            = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
   return ct::partition_view{span, ct::shape<TileSize>{}};
@@ -52,13 +51,15 @@ template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
 transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
-  namespace ct  = ::cuda::tiles;
+  namespace ct = ::cuda::tiles;
   using cub::detail::transform::tile::make_aligned_partition_view;
   const auto bx = ct::bid().x;
 
   const auto n        = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
   const auto out_view = make_aligned_partition_view<TileSize>(out, n);
-  auto load_one       = [bx, n](auto* ptr) { return make_aligned_partition_view<TileSize>(ptr, n).load_masked(bx); };
+  auto load_one       = [bx, n](auto* ptr) {
+    return make_aligned_partition_view<TileSize>(ptr, n).load_masked(bx);
+  };
 
   out_view.store_masked(Fn{}(load_one(ins)...), bx);
 }
@@ -66,7 +67,7 @@ transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, co
 template <int TileSize, typename T>
 __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
 {
-  namespace ct  = ::cuda::tiles;
+  namespace ct = ::cuda::tiles;
   using cub::detail::transform::tile::make_aligned_partition_view;
   const auto bx = ct::bid().x;
 
@@ -75,7 +76,6 @@ __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __rest
   using tile_t        = ct::tile<T, ct::shape<TileSize>>;
   out_view.store_masked(ct::full<tile_t>(value), bx);
 }
-
 } // namespace detail::transform::tile
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
index a7715e6f195..8a11ad60f7a 100644
--- a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -36,7 +36,6 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail::transform::tile
 {
-
 // mufu_heavy=true tells the policy the functor body has heavy MUFU usage.
 // for small data types, vectorized load will make them arrive packed in
 // registers and the compiler unpacks them and packs them back. reducing the
@@ -79,7 +78,6 @@ constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability
 
   return items * threads_per_block;
 }
-
 } // namespace detail::transform::tile
 
 CUB_NAMESPACE_END

From 4a5ec541567abc89550186f071341523faaaaee8 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Thu, 11 Jun 2026 23:48:26 -0700
Subject: [PATCH 62/83] guard fp16/bf16 in tile pytorch bench on CTK 12.2+

Matches the base transform/pytorch.cu: __half/__nv_bfloat16 are only added to the
type axis under _CCCL_CTK_AT_LEAST(12, 2). On CTK 12.0 __nv_bfloat16 has only
float/double constructors, so constructing it from an int literal (T(1) in
run_unary/run_binary) is ambiguous -- which broke the CTK 12.0 build.
---
 cub/benchmarks/bench/transform/tile/pytorch.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 006f88ff5c6..ccb4680ba21 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -425,10 +425,10 @@ CUB_NAMESPACE_END
 using element_types = nvbench::type_list<TUNE_T>;
 #else
 using element_types = nvbench::type_list<
-#  if _CCCL_HAS_NVFP16()
+#  if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
   __half,
 #  endif
-#  if _CCCL_HAS_NVBF16()
+#  if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
   __nv_bfloat16,
 #  endif
   float>;

From 76ac55f715cd3da405538d3db9d3320426cdd91c Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 12:24:01 -0700
Subject: [PATCH 63/83] drop redundant comment on the gate macro

---
 cub/cub/device/dispatch/dispatch_transform_tile_config.cuh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
index 8c25ea9bd30..05c928acc1c 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -31,9 +31,6 @@
 
 #define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION()
 
-// Defined as a literal 1/0 (not (_CCCL_CUB_HAS_TILE_TRANSFORM() && defined(...))) so that
-// `#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()` in non-system code (benches, tests) does not
-// generate `defined` via macro expansion, which is UB and trips -Wexpansion-to-defined under -Werror.
 #if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)
 #  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1
 #else

From 67b94897cb54721828597632f4f0487c41c8f32e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 15:37:47 -0700
Subject: [PATCH 64/83] address review nits in tile transform dispatch

- device_transform.cuh: reflow the aligned_size_t comment to 120 cols; use `constexpr auto num_items_align` instead of spelling out ::cuda::std::size_t

- dispatch_transform_tile.cuh: use ::cuda::std::iter_value_t over the CUB-local cub::detail::it_value_t; drop the redundant <cuda_runtime.h> include (cudaError_t/cudaStream_t/cudaGetLastError come in transitively via cub/util_debug.cuh)
---
 cub/cub/device/device_transform.cuh                | 14 ++++++--------
 .../device/dispatch/dispatch_transform_tile.cuh    |  5 ++---
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 24a828c5fc0..9fff2582bc5 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -99,14 +99,12 @@ struct DeviceTransform
     // value passed by the user, but otherwise ignore the chosen signed offset type.
     using offset_t = ::cuda::std::int64_t;
 
-    // num_items may be a plain integer or a cuda::aligned_size_t<N> -- an opt-in promise (the same one
-    // cuda::memcpy_async uses) that the pointers are N-aligned and num_items is a multiple of N. Unwrap
-    // it to a plain integer for the offset machinery (choose_signed_offset requires an integral type);
-    // the alignment promise is read separately by the tile hook below. For a plain integer this is a
-    // no-op: count_t == NumItemsT and count == num_items.
-    constexpr ::cuda::std::size_t num_items_align = ::cuda::__get_size_align_v<NumItemsT>;
-    using count_t       = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>;
-    const count_t count = static_cast<count_t>(num_items);
+    // num_items may be a plain integer or cuda::aligned_size_t<N> (the cuda::memcpy_async-style opt-in promising N-byte
+    // pointer alignment + size divisibility). Unwrap to a plain integer for the offset machinery (choose_signed_offset
+    // needs an integral type); the tile hook below reads the alignment promise. No-op for a plain integer.
+    constexpr auto num_items_align = ::cuda::__get_size_align_v<NumItemsT>;
+    using count_t                  = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>;
+    const count_t count            = static_cast<count_t>(num_items);
     if (const cudaError_t error = detail::choose_signed_offset<count_t>::is_exceeding_offset_type(count))
     {
       return error;
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 7ce52562fa0..cee5effa0bb 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -37,6 +37,7 @@
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #  include <cuda/__cmath/ceil_div.h>
+#  include <cuda/std/__iterator/readable_traits.h>
 #  include <cuda/std/__memory/is_sufficiently_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
 #  include <cuda/std/__type_traits/is_empty.h>
@@ -45,8 +46,6 @@
 #  include <cuda/std/cstdint>
 #  include <cuda/std/tuple>
 
-#  include <cuda_runtime.h>
-
 CUB_NAMESPACE_BEGIN
 
 namespace detail::transform::tile
@@ -115,7 +114,7 @@ template <typename Op, typename OutIter, typename... InIters>
 inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
-  && cub::transform::tile_eligible_v<Op, cub::detail::it_value_t<OutIter>, sizeof...(InIters)>;
+  && cub::transform::tile_eligible_v<Op, ::cuda::std::iter_value_t<OutIter>, sizeof...(InIters)>;
 
 // Runtime predicate consulted by the cub::DeviceTransform tile hook before
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize

From 8614d456c66eda5c95b6782dfc29ecf993cf97a8 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 15:37:47 -0700
Subject: [PATCH 65/83] collapse tile_eligible to a single variable template

tile_eligible<Op,T,NIn> was a false_type struct that tile_eligible_v just forwarded to. Drop the struct and make tile_eligible_v the specializable extension point directly -- same (Op,T,NIn) granularity, one name instead of two. Updates the built-in half/bf16 specializations and the bench/test registrations to specialize the variable template (partial specialization over T).
---
 .../bench/transform/tile/babelstream.cu       | 12 ++----
 cub/benchmarks/bench/transform/tile/copy.cu   |  3 +-
 .../bench/transform/tile/grayscale.cu         |  3 +-
 .../bench/transform/tile/pytorch.cu           | 42 +++++++------------
 .../dispatch_transform_tile_traits.cuh        | 31 ++++++--------
 cub/test/catch2_test_device_transform_tile.cu |  6 +--
 6 files changed, 34 insertions(+), 63 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 6e9caf03f2d..60ccfd9f5f7 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -86,17 +86,13 @@ CUB_NAMESPACE_BEGIN
 namespace transform
 {
 template <class T>
-struct tile_eligible<mul_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<mul_op, T, 1> = true;
 template <class T>
-struct tile_eligible<add_op, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<add_op, T, 2> = true;
 template <class T>
-struct tile_eligible<triad_op, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<triad_op, T, 2> = true;
 template <class T>
-struct tile_eligible<nstream_op, T, 3> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<nstream_op, T, 3> = true;
 template <>
 struct tile_operator<mul_op>
 {
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index 85ed12e0d4d..e766c420286 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -36,8 +36,7 @@ CUB_NAMESPACE_BEGIN
 namespace transform
 {
 template <class T>
-struct tile_eligible<identity, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<identity, T, 1> = true;
 template <>
 struct tile_operator<identity>
 {
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index f9ab98d62ad..daee79afc16 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -41,8 +41,7 @@ CUB_NAMESPACE_BEGIN
 namespace transform
 {
 template <class T>
-struct tile_eligible<rgb_to_y, T, 3> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<rgb_to_y, T, 3> = true;
 template <>
 struct tile_operator<rgb_to_y>
 {
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index ccb4680ba21..527ac65eb72 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -286,23 +286,17 @@ namespace transform
 {
 // Unary
 template <class T>
-struct tile_eligible<relu_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<relu_op, T, 1> = true;
 template <class T>
-struct tile_eligible<sigmoid_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<sigmoid_op, T, 1> = true;
 template <class T>
-struct tile_eligible<tanh_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<tanh_op, T, 1> = true;
 template <class T>
-struct tile_eligible<gelu_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<gelu_op, T, 1> = true;
 template <class T>
-struct tile_eligible<sin_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<sin_op, T, 1> = true;
 template <class T>
-struct tile_eligible<exp_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<exp_op, T, 1> = true;
 template <>
 struct tile_operator<relu_op>
 {
@@ -354,29 +348,21 @@ struct tile_mufu_heavy<exp_op> : ::cuda::std::true_type
 
 // Binary
 template <class T>
-struct tile_eligible<binary_add, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_add, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_sub, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_sub, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_mul, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_mul, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_div, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_div, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_le, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_le, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_ge, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_ge, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_fmin, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_fmin, T, 2> = true;
 template <class T>
-struct tile_eligible<binary_fmax, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<binary_fmax, T, 2> = true;
 template <>
 struct tile_operator<binary_add>
 {
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index f51d280264b..c347f6f0631 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -4,10 +4,9 @@
 // Compile-time policy for cub::DeviceTransform's tile path.
 //
 // PUBLIC EXTENSION POINTS (cub::transform) -- two independent axes:
-//   tile_eligible<Op, T, NIn>   -- specialize to true_type to opt a (functor
-//                                   type, element type, input arity) combo into
-//                                   the tile dispatch path. Eligibility only.
-//   tile_eligible_v<...>        -- variable-template companion.
+//   tile_eligible_v<Op, T, NIn> -- specialize to true to opt a (functor type,
+//                                   element type, input arity) combo into the
+//                                   tile dispatch path. Eligibility only.
 //   tile_operator<Op>           -- the __tile__ functor the tile kernel runs
 //                                   for Op. No default: every tile-eligible Op
 //                                   must specialize it with `using type = <a
@@ -23,7 +22,7 @@
 //
 // Eligibility ("may this combo use the tile path?") and substitution ("which
 // __tile__ functor do we actually run?") are separate traits, so an eligible op
-// always registers both: tile_eligible<Op,T,NIn> and tile_operator<Op>.
+// always registers both: tile_eligible_v<Op,T,NIn> and tile_operator<Op>.
 //
 // INTERNAL (cub::detail::transform::tile):
 //   tile_plus, tile_multiplies   -- shipped tile-friendly substitutes used by
@@ -56,16 +55,14 @@ CUB_NAMESPACE_BEGIN
 // Public extension surface.
 namespace transform
 {
+// Opt a (functor type, element type, input arity) combo into the tile dispatch path: specialize this to
+// true for the combo. Eligibility only -- the __tile__ functor to actually run is named by tile_operator<Op>.
 template <typename Op, typename T, ::cuda::std::size_t NIn>
-struct tile_eligible : ::cuda::std::false_type
-{};
-
-template <typename Op, typename T, ::cuda::std::size_t NIn>
-inline constexpr bool tile_eligible_v = tile_eligible<Op, T, NIn>::value;
+inline constexpr bool tile_eligible_v = false;
 
 // The __tile__ functor the tile kernel runs for Op -- the tile-side mirror of the scalar Op. There is
 // no default: a scalar functor cannot be invoked on ct::tile, so every tile-eligible Op must specialize
-// this with a `type` naming a stateless __tile__ functor. tile_eligible<Op,...> says a combo MAY use the
+// this with a `type` naming a stateless __tile__ functor. tile_eligible_v<Op,...> says a combo MAY use the
 // tile path; tile_operator<Op> says WHAT the tile kernel runs.
 template <typename Op>
 struct tile_operator
@@ -120,11 +117,9 @@ namespace transform
 // cuda::std::plus / multiplies are scalar ops, so each is marked eligible and given a tile_operator mirror.
 #  if _CCCL_HAS_NVFP16()
 template <>
-struct tile_eligible<::cuda::std::plus<::__half>, ::__half, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<::cuda::std::plus<::__half>, ::__half, 2> = true;
 template <>
-struct tile_eligible<::cuda::std::multiplies<::__half>, ::__half, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__half>, ::__half, 2> = true;
 template <>
 struct tile_operator<::cuda::std::plus<::__half>>
 {
@@ -139,11 +134,9 @@ struct tile_operator<::cuda::std::multiplies<::__half>>
 
 #  if _CCCL_HAS_NVBF16()
 template <>
-struct tile_eligible<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true;
 template <>
-struct tile_eligible<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true;
 template <>
 struct tile_operator<::cuda::std::plus<::__nv_bfloat16>>
 {
diff --git a/cub/test/catch2_test_device_transform_tile.cu b/cub/test/catch2_test_device_transform_tile.cu
index 3ff05fac915..f77eea0e31b 100644
--- a/cub/test/catch2_test_device_transform_tile.cu
+++ b/cub/test/catch2_test_device_transform_tile.cu
@@ -66,8 +66,7 @@ CUB_NAMESPACE_BEGIN
 namespace transform
 {
 template <class T>
-struct tile_eligible<square_op, T, 1> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<square_op, T, 1> = true;
 template <>
 struct tile_operator<square_op>
 {
@@ -75,8 +74,7 @@ struct tile_operator<square_op>
 };
 
 template <class T>
-struct tile_eligible<add_op, T, 2> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_eligible_v<add_op, T, 2> = true;
 template <>
 struct tile_operator<add_op>
 {

From 81f21333b3a65ccf29b06d28e4903272ea968caa Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 15:37:47 -0700
Subject: [PATCH 66/83] avoid nvcc 13.4 tile lambda-linkage miscompile: call
 partition-view helper directly

transform_kernel loaded inputs through a load_one lambda that called make_aligned_partition_view. Under --expt-relaxed-constexpr (which the benches pass), nvcc 13.4 treats the implicitly-constexpr lambda as __host__ __device__, so a non-tile copy of the lambda references the __tile__ helper -- which has no body outside tile space -- and the compiler emits the helper as a bodiless internal-linkage declaration, tripping the IR verifier ("Broken module"). Call the helper directly in the pack expansion instead; the kernel body is pure-tile, so the conflict cannot arise. A minimal repro and an nvbug have been filed; this is in the same execution-space family of issues as the _CCCL_API __tile__ strip.
---
 .../kernels/kernel_transform_tile.cuh         | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 778721f6257..a8bcd7dd836 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -47,30 +47,26 @@ template <int TileSize, typename T, typename N>
 //
 // assume_divisible<16>      -- promises num_items % 16 == 0, so the tile DSL can elide tail handling.
 // assume_bounded_below<0>   -- promises num_items >= 0; enables sign-comparison simplifications.
+//
+// NOTE: make_aligned_partition_view is invoked directly. do NOT wrap these calls in a lambda because of compiler bug:
+// templated __tile__ helper + a lambda that calls it + --expt-relaxed-constexpr produces invalid IR.
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
 transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
-  namespace ct = ::cuda::tiles;
-  using cub::detail::transform::tile::make_aligned_partition_view;
+  namespace ct  = ::cuda::tiles;
   const auto bx = ct::bid().x;
+  const auto n  = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
 
-  const auto n        = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
   const auto out_view = make_aligned_partition_view<TileSize>(out, n);
-  auto load_one       = [bx, n](auto* ptr) {
-    return make_aligned_partition_view<TileSize>(ptr, n).load_masked(bx);
-  };
-
-  out_view.store_masked(Fn{}(load_one(ins)...), bx);
+  out_view.store_masked(Fn{}(make_aligned_partition_view<TileSize>(ins, n).load_masked(bx)...), bx);
 }
 
 template <int TileSize, typename T>
 __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
 {
-  namespace ct = ::cuda::tiles;
-  using cub::detail::transform::tile::make_aligned_partition_view;
-  const auto bx = ct::bid().x;
-
+  namespace ct        = ::cuda::tiles;
+  const auto bx       = ct::bid().x;
   const auto n        = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
   const auto out_view = make_aligned_partition_view<TileSize>(out, n);
   using tile_t        = ct::tile<T, ct::shape<TileSize>>;

From 393ce96381fbc447158ec29d27f86f9d1dd87a62 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 16:10:42 -0700
Subject: [PATCH 67/83] add opt-in CMake option to build cub::DeviceTransform's
 tile path under --enable-tile

CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF, defined in cub/CMakeLists.txt) scopes nvcc --enable-tile + the dispatch opt-in macro to just the tile transform test and the bench/transform/tile benches, via per-target compile options in test/ and benchmarks/. This replaces forcing --enable-tile through global CMAKE_CUDA_FLAGS, which also lands on the C++17 c2h helper lib where cuda_tile.h hard-errors on the dialect. CI never sets the option, so the tile path stays compiled out there; locally the test builds + passes (960 assertions on sm_120).
---
 cub/CMakeLists.txt            | 10 ++++++++++
 cub/benchmarks/CMakeLists.txt | 16 ++++++++++++++++
 cub/test/CMakeLists.txt       | 16 ++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt
index 4c8c778f7fe..4b872a7993c 100644
--- a/cub/CMakeLists.txt
+++ b/cub/CMakeLists.txt
@@ -10,6 +10,16 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
 option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
 option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)
 
+# Opt-in: build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`.
+# Defaults OFF; CI never sets it, so the tile code stays gated out except in an explicit local
+# --enable-tile build. Applied per-target in test/ and benchmarks/ -- never via global CMAKE_CUDA_FLAGS,
+# which would also hit the C++17 c2h helper lib where cuda_tile.h hard-errors on the dialect.
+option(
+  CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+  "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)."
+  OFF
+)
+
 option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF)
 if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
   set(CUB_ENABLE_TUNING OFF)
diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt
index e54cf1c80db..c9e9e7893cd 100644
--- a/cub/benchmarks/CMakeLists.txt
+++ b/cub/benchmarks/CMakeLists.txt
@@ -91,6 +91,22 @@ function(add_bench target_name bench_name bench_src)
       cccl.nvbench_helper
       nvbench::main
   )
+
+  # Tile-DSL transform benches: build under --enable-tile + the dispatch opt-in when requested. Gated by
+  # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI builds the tile/ benches with the tile path off.
+  if (
+    CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+    AND "${bench_src}" MATCHES "/transform/tile/"
+  )
+    target_compile_options(
+      ${bench_target}
+      PRIVATE "$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--enable-tile>"
+    )
+    target_compile_definitions(
+      ${bench_target}
+      PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+    )
+  endif()
 endfunction()
 
 function(add_bench_dir bench_dir)
diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt
index ce46a86b93e..0214861c053 100644
--- a/cub/test/CMakeLists.txt
+++ b/cub/test/CMakeLists.txt
@@ -172,6 +172,22 @@ function(
       target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists
     endif()
 
+    # Tile-DSL transform test: compile under --enable-tile and turn on the dispatch hook. Gated by
+    # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH, so builds with that option off keep the tile path compiled out.
+    if (
+      CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+      AND "${test_src}" MATCHES "test_device_transform_tile\\.cu$"
+    )
+      target_compile_options(
+        ${test_target}
+        PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--enable-tile>
+      )
+      target_compile_definitions(
+        ${test_target}
+        PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+      )
+    endif()
+
     # enable lambdas for all API examples
     if ("${test_src}" MATCHES "test.+_api\\.cu$")
       target_compile_options(

From 76d02eb7dc1b1399b506542070cfdfdd9683899e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 16:56:41 -0700
Subject: [PATCH 68/83] gate cub::DeviceTransform tile path on CTK 13.4

Tile C++ has existed since CTK 13.3, but the 13.3 tile compiler has too many codegen issues, so 13.4 is the supported floor. _CCCL_CUB_HAS_TILE_TRANSFORM() now requires _CCCL_CTK_AT_LEAST(13, 4) in addition to --enable-tile, so the tile headers compile out entirely below 13.4. With an NVIDIA nvcc older than 13.4, enabling the CMake option now fails at configure time with a clear message instead of failing later in the build (e.g. on an unrecognized --enable-tile before 13.3).
---
 cub/CMakeLists.txt                                 | 11 +++++++++++
 .../dispatch/dispatch_transform_tile_config.cuh    | 14 ++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt
index 4b872a7993c..e45b5635e91 100644
--- a/cub/CMakeLists.txt
+++ b/cub/CMakeLists.txt
@@ -19,6 +19,17 @@ option(
   "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)."
   OFF
 )
+if (
+  CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+  AND "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA"
+  AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4
+)
+  message(
+    FATAL_ERROR
+    "CCCL_ENABLE_TILE_TRANSFORM_DISPATCH requires CUDA 13.4+ (nvcc --enable-tile). "
+    "Found ${CMAKE_CUDA_COMPILER_VERSION}."
+  )
+endif()
 
 option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF)
 if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
index 05c928acc1c..833bca94d83 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -6,11 +6,13 @@
 //
 //   _CCCL_CUB_HAS_TILE_TRANSFORM()
 //     True when nvcc is compiling in tile mode (--enable-tile, i.e.
-//     _CCCL_TILE_COMPILATION()). The other preconditions tile needs are
-//     enforced where they belong: CTK 13.3+ is implied because --enable-tile
-//     is a 13.3+ nvcc flag, and C++20 is enforced by cuda_tile.h itself with
-//     an explicit #error. When false, the tile headers (kernel / tuning /
-//     dispatch / traits) are skipped entirely.
+//     _CCCL_TILE_COMPILATION()) AND the toolkit is CTK 13.4+. tile C++ exists
+//     since 13.3, but we require 13.4: the 13.3 tile compiler has too many
+//     codegen issues, so 13.4 is the supported floor. (C++20 is enforced by
+//     cuda_tile.h itself with an explicit #error.) The sm_80+ requirement is
+//     handled at runtime in the dispatch + NV_IF_TARGET in the kernels, not
+//     here, since this gate is host+device. When false, the tile headers
+//     (kernel / tuning / dispatch / traits) are skipped entirely.
 //
 //   _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
 //     True when the dispatch hook in cub::DeviceTransform should fire. Same as
@@ -29,7 +31,7 @@
 #  pragma system_header
 #endif // no system header
 
-#define _CCCL_CUB_HAS_TILE_TRANSFORM() _CCCL_TILE_COMPILATION()
+#define _CCCL_CUB_HAS_TILE_TRANSFORM() (_CCCL_TILE_COMPILATION() && _CCCL_CTK_AT_LEAST(13, 4))
 
 #if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)
 #  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1

From 6e713645baef0ae4340cfa52487e6028e23a75f4 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 17:45:39 -0700
Subject: [PATCH 69/83] gate the tile path on sm_80 (NV_IF_TARGET + runtime cc
 check)

tile requires sm_80+. Mirrors how CUB handles its arch-specific ublkcp kernel -- no CMake arch gate: (1) NV_IF_TARGET(NV_PROVIDES_SM_80) around the transform/fill kernel bodies, so sub-80 cubins carry no tile SASS; (2) a runtime device_supports_tile() (cc >= 80) check in the dispatch hook -- below sm_80, or if the capability query fails, fall back to standard CUB. --enable-tile itself accepts all CTK-13.4 arches, so the floor is enforced at runtime, exactly like ublkcp's cc >= 90 policy gate.
---
 cub/cub/device/device_transform.cuh           | 22 +++++++++-----
 .../dispatch/dispatch_transform_tile.cuh      | 20 ++++++++-----
 .../kernels/kernel_transform_tile.cuh         | 30 +++++++++++--------
 3 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 9fff2582bc5..c6059b1cb69 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -113,24 +113,32 @@ struct DeviceTransform
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible we route to the tile kernel:
+    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+ we route to the
+    // tile kernel:
     //  - if num_items is a cuda::aligned_size_t<N>=16, the caller has promised 16-byte pointer
     //    alignment + divisibility, so we commit to tile at compile time and skip the runtime check;
     //  - otherwise we check the alignment/divisibility preconditions at runtime and fall through to
     //    the standard CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail
     //    case, so this is a graceful fallback, not an error).
+    // device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if the capability
+    // query fails) we fall through to the standard CUB dispatch.
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
                   && cub::detail::transform::tile::
                     tile_dispatch_eligible_v<TransformOp, RandomAccessIteratorOut, RandomAccessIteratorsIn...>)
     {
-      if constexpr (num_items_align >= 16)
+      if (cub::detail::transform::tile::device_supports_tile())
       {
-        return cub::detail::transform::tile::dispatch<TransformOp>(inputs, output, static_cast<offset_t>(count), stream);
-      }
-      else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(count)))
-      {
-        return cub::detail::transform::tile::dispatch<TransformOp>(inputs, output, static_cast<offset_t>(count), stream);
+        if constexpr (num_items_align >= 16)
+        {
+          return cub::detail::transform::tile::dispatch<TransformOp>(
+            inputs, output, static_cast<offset_t>(count), stream);
+        }
+        else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(count)))
+        {
+          return cub::detail::transform::tile::dispatch<TransformOp>(
+            inputs, output, static_cast<offset_t>(count), stream);
+        }
       }
     }
 #endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index cee5effa0bb..ff20071c6bb 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -10,7 +10,7 @@
 //                                used by `dispatch`
 // User-facing extension points (tile_eligible / tile_mufu_heavy) live in
 // dispatch_transform_tile_traits.cuh under cub::transform.
-// Requires CTK 13.3 or newer and nvcc invoked with --enable-tile.
+// Requires CTK 13.4 or newer and nvcc invoked with --enable-tile.
 
 #pragma once
 
@@ -32,11 +32,13 @@
 #  include <cub/device/dispatch/kernels/kernel_transform_tile.cuh>
 #  include <cub/device/dispatch/tuning/tuning_transform_tile.cuh>
 #  include <cub/util_debug.cuh>
+#  include <cub/util_device.cuh>
 
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 #  include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #  include <cuda/__cmath/ceil_div.h>
+#  include <cuda/__device/compute_capability.h>
 #  include <cuda/std/__iterator/readable_traits.h>
 #  include <cuda/std/__memory/is_sufficiently_aligned.h>
 #  include <cuda/std/__tuple_dir/apply.h>
@@ -104,18 +106,22 @@ struct DeviceTransform
   }
 };
 
-// Combined compile-time predicate used by cub::DeviceTransform's __transform_internal
-// to decide whether to route a given (Op, OutIter, InIters...) to the tile path.
-// The call site lifts this into an `if constexpr`: when this is true the hook
-// tries the tile kernel first and, on runtime alignment / divisibility
-// failure, falls through to the standard CUB dispatch below. When false, the
-// tile branch is discarded and only CUB's standard path is emitted.
+// Combined compile-time predicate for whether (Op, OutIter, InIters...) can use the tile path. We use this with
+// `if constexpr` for dispatch: when true the hook tries the tile kernel first and, on runtime alignment/divisibility
+// failure, falls through to the standard CUB dispatch; when false the tile branch is discarded entirely.
 template <typename Op, typename OutIter, typename... InIters>
 inline constexpr bool tile_dispatch_eligible_v =
   THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
   && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
   && cub::transform::tile_eligible_v<Op, ::cuda::std::iter_value_t<OutIter>, sizeof...(InIters)>;
 
+// Runtime arch gate: tile needs sm_80+. False (fall back to CUB) below sm_80 or if the cc query fails.
+[[nodiscard]] CUB_RUNTIME_FUNCTION inline bool device_supports_tile()
+{
+  ::cuda::compute_capability cc{};
+  return cub::detail::ptx_compute_cap(cc) == ::cudaSuccess && cc >= ::cuda::compute_capability{8, 0};
+}
+
 // Runtime predicate consulted by the cub::DeviceTransform tile hook before
 // it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
 // guards the vectorized kernel. The tile kernels use ct::assume_aligned<16>
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index a8bcd7dd836..21345c6bf31 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -19,6 +19,8 @@
 
 #  include <cuda/std/cstdint>
 
+#  include <nv/target>
+
 #  include <cuda_tile.h>
 
 CUB_NAMESPACE_BEGIN
@@ -48,29 +50,33 @@ template <int TileSize, typename T, typename N>
 // assume_divisible<16>      -- promises num_items % 16 == 0, so the tile DSL can elide tail handling.
 // assume_bounded_below<0>   -- promises num_items >= 0; enables sign-comparison simplifications.
 //
+// The body is guarded by NV_IF_TARGET(NV_PROVIDES_SM_80): tile requires sm_80+, so on older arches the kernel
+// compiles to a no-op (no unsupported SASS). The dispatch only launches it on sm_80+ devices (runtime cc check).
+//
 // NOTE: make_aligned_partition_view is invoked directly. do NOT wrap these calls in a lambda because of compiler bug:
 // templated __tile__ helper + a lambda that calls it + --expt-relaxed-constexpr produces invalid IR.
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
 transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
 {
-  namespace ct  = ::cuda::tiles;
-  const auto bx = ct::bid().x;
-  const auto n  = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-
-  const auto out_view = make_aligned_partition_view<TileSize>(out, n);
-  out_view.store_masked(Fn{}(make_aligned_partition_view<TileSize>(ins, n).load_masked(bx)...), bx);
+  namespace ct = ::cuda::tiles;
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_80,
+    (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+     const auto out_view                       = make_aligned_partition_view<TileSize>(out, n);
+     out_view.store_masked(Fn{}(make_aligned_partition_view<TileSize>(ins, n).load_masked(bx)...), bx);));
 }
 
 template <int TileSize, typename T>
 __tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
 {
-  namespace ct        = ::cuda::tiles;
-  const auto bx       = ct::bid().x;
-  const auto n        = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-  const auto out_view = make_aligned_partition_view<TileSize>(out, n);
-  using tile_t        = ct::tile<T, ct::shape<TileSize>>;
-  out_view.store_masked(ct::full<tile_t>(value), bx);
+  namespace ct = ::cuda::tiles;
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_80,
+    (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+     const auto out_view                       = make_aligned_partition_view<TileSize>(out, n);
+     using tile_t                              = ct::tile<T, ct::shape<TileSize>>;
+     out_view.store_masked(ct::full<tile_t>(value), bx);));
 }
 } // namespace detail::transform::tile
 

From a1c01c71087ffea97afaa4f1b9fc742eca2f57a2 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 17:45:39 -0700
Subject: [PATCH 70/83] default CCCL_ENABLE_TILE_TRANSFORM_DISPATCH ON for CTK
 13.4+

Instead of a hard OFF default, the option now defaults to ON when the toolkit can build the tile path (CUDA compiler ID NVIDIA, version 13.4 or newer), so 13.4+ configs -- including CI -- build and run the tile transform test (and benches) automatically; below 13.4 it defaults to OFF and the tile path compiles out. The sm_80+ hardware floor is handled at runtime (dispatch cc check + NV_IF_TARGET in the kernels), so an auto-enabled 13.4+ build still runs correctly on any GPU (falling back to standard CUB below sm_80). An explicit ON below 13.4 still errors via the existing guard.
---
 cub/CMakeLists.txt | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt
index e45b5635e91..820614d1e65 100644
--- a/cub/CMakeLists.txt
+++ b/cub/CMakeLists.txt
@@ -10,14 +10,21 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
 option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
 option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)
 
-# Opt-in: build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`.
-# Defaults OFF; CI never sets it, so the tile code stays gated out except in an explicit local
-# --enable-tile build. Applied per-target in test/ and benchmarks/ -- never via global CMAKE_CUDA_FLAGS,
-# which would also hit the C++17 c2h helper lib where cuda_tile.h hard-errors on the dialect.
+# Build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. Defaults ON when the
+# toolkit can build it (CTK 13.4+), so 13.4+ configs -- including CI -- exercise the tile path automatically; OFF
+# and compiled out below 13.4. The sm_80+ floor is enforced at runtime (dispatch cc check + NV_IF_TARGET in the
+# kernels), so a 13.4+ build still runs correctly on any GPU.
+set(_cccl_tile_transform_default OFF)
+if (
+  "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA"
+  AND NOT "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4
+)
+  set(_cccl_tile_transform_default ON)
+endif()
 option(
   CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
   "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)."
-  OFF
+  ${_cccl_tile_transform_default}
 )
 if (
   CCCL_ENABLE_TILE_TRANSFORM_DISPATCH

From b9d49d02d8b0193382e0c17b4bd810b36bc1a013 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Fri, 12 Jun 2026 18:02:09 -0700
Subject: [PATCH 71/83] trim verbose comments in the tile transform headers

Comment-only cleanup: tighten the over-long doc blocks (gate-macro CTK note, tile_operator, runtime_preconditions_valid, the dispatch bridge, and the kernel header) and pack multiline comments to the 120-col limit. No code change.
---
 .../dispatch/dispatch_transform_tile.cuh      | 20 +++++--------------
 .../dispatch_transform_tile_config.cuh        |  9 ++-------
 .../dispatch_transform_tile_traits.cuh        |  5 +----
 .../kernels/kernel_transform_tile.cuh         | 18 +++++++----------
 4 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index ff20071c6bb..48393cc6dcc 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -122,11 +122,8 @@ inline constexpr bool tile_dispatch_eligible_v =
   return cub::detail::ptx_compute_cap(cc) == ::cudaSuccess && cc >= ::cuda::compute_capability{8, 0};
 }
 
-// Runtime predicate consulted by the cub::DeviceTransform tile hook before
-// it commits to the tile path. Mirrors how CUB's dispatch_t::CanVectorize
-// guards the vectorized kernel. The tile kernels use ct::assume_aligned<16>
-// and ct::assume_divisible<16>, so violating these at runtime is UB.
-// Returns false to tell the hook to fall back to the standard CUB dispatch.
+// Runtime precondition the tile hook checks before dispatching: 16-byte pointer alignment + num_items % 16 == 0
+// (the kernels assume_aligned<16>/assume_divisible<16>, so violating these is UB). False -> fall back to CUB.
 template <typename OutIter, typename... InIters, typename OffsetT>
 [[nodiscard]] CUB_RUNTIME_FUNCTION bool
 runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
@@ -150,16 +147,9 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
   return aligned_out && aligned_in && (num_items % items_divisor) == 0;
 }
 
-// Bridge between cub::DeviceTransform::__transform_internal and the tile
-// DeviceTransform above. Precondition: tile_dispatch_eligible_v<Op, OutIter,
-// InIters...> is true AND runtime_preconditions_valid returned true. The kernel
-// itself assumes 16-byte pointer alignment and num_items divisibility; the
-// caller (the hook in device_transform.cuh) is responsible for checking
-// runtime_preconditions_valid first.
-//
-// The tile kernel is launched with tile_operator_t<Op>: for a scalar Op that is its
-// registered tile-friendly mirror (a __tile__ functor), and for an already-tile Op it
-// is Op itself. A scalar functor cannot be invoked on ct::tile arguments.
+// Bridge from cub::DeviceTransform::__transform_internal to the tile DeviceTransform. Precondition (the caller
+// checks it): tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. Launches the kernel
+// with tile_operator_t<Op> -- Op's registered __tile__ mirror (a scalar functor can't be invoked on ct::tile).
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
 [[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t
 dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
index 833bca94d83..0c06b091335 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -5,13 +5,8 @@
 // share. Two macros:
 //
 //   _CCCL_CUB_HAS_TILE_TRANSFORM()
-//     True when nvcc is compiling in tile mode (--enable-tile, i.e.
-//     _CCCL_TILE_COMPILATION()) AND the toolkit is CTK 13.4+. tile C++ exists
-//     since 13.3, but we require 13.4: the 13.3 tile compiler has too many
-//     codegen issues, so 13.4 is the supported floor. (C++20 is enforced by
-//     cuda_tile.h itself with an explicit #error.) The sm_80+ requirement is
-//     handled at runtime in the dispatch + NV_IF_TARGET in the kernels, not
-//     here, since this gate is host+device. When false, the tile headers
+//     True when nvcc is in tile mode (--enable-tile / _CCCL_TILE_COMPILATION()) AND CTK 13.4+. The sm_80+
+//     requirement is handled at runtime + NV_IF_TARGET in the kernels, not here. When false, the tile headers
 //     (kernel / tuning / dispatch / traits) are skipped entirely.
 //
 //   _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index c347f6f0631..d9db2d4684c 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -60,10 +60,7 @@ namespace transform
 template <typename Op, typename T, ::cuda::std::size_t NIn>
 inline constexpr bool tile_eligible_v = false;
 
-// The __tile__ functor the tile kernel runs for Op -- the tile-side mirror of the scalar Op. There is
-// no default: a scalar functor cannot be invoked on ct::tile, so every tile-eligible Op must specialize
-// this with a `type` naming a stateless __tile__ functor. tile_eligible_v<Op,...> says a combo MAY use the
-// tile path; tile_operator<Op> says WHAT the tile kernel runs.
+// The __tile__ functor the tile kernel runs for Op.
 template <typename Op>
 struct tile_operator
 {
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index 21345c6bf31..bd36bd1843d 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -43,18 +43,14 @@ template <int TileSize, typename T, typename N>
   return ct::partition_view{span, ct::shape<TileSize>{}};
 }
 
-// Tile DSL kernels backing cub::DeviceTransform's tile path. The kernels assume 16-byte alignment on every pointer
-// and 16-byte divisibility on num_items so the compiler can pick LDG.E.128. Callers in the dispatch header are
-// responsible for honoring those preconditions.
+// Tile DSL kernels backing cub::DeviceTransform's tile path. They assume 16-byte pointer alignment + 16-divisible
+// num_items (so the compiler picks LDG.E.128); the dispatch header honors that. NV_IF_TARGET(NV_PROVIDES_SM_80)
+// guards the body -- tile needs sm_80+, so sub-80 arches get a no-op kernel (dispatch only launches it on sm_80+).
+//   assume_divisible<16>     -- num_items % 16 == 0, so the tile DSL can elide tail handling.
+//   assume_bounded_below<0>  -- num_items >= 0; enables sign-comparison simplifications.
 //
-// assume_divisible<16>      -- promises num_items % 16 == 0, so the tile DSL can elide tail handling.
-// assume_bounded_below<0>   -- promises num_items >= 0; enables sign-comparison simplifications.
-//
-// The body is guarded by NV_IF_TARGET(NV_PROVIDES_SM_80): tile requires sm_80+, so on older arches the kernel
-// compiles to a no-op (no unsupported SASS). The dispatch only launches it on sm_80+ devices (runtime cc check).
-//
-// NOTE: make_aligned_partition_view is invoked directly. do NOT wrap these calls in a lambda because of compiler bug:
-// templated __tile__ helper + a lambda that calls it + --expt-relaxed-constexpr produces invalid IR.
+// NOTE: make_aligned_partition_view is invoked directly -- do NOT wrap these calls in a lambda: nvcc 13.4
+// miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR).
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
 transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)

From f76647e75c45fa6bbf8b0f1f7fdecfaf2329fe8a Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Sat, 13 Jun 2026 20:37:37 -0700
Subject: [PATCH 72/83] tidy tile bench type axes

copy: drop float (same 4-byte width as int32_t -- redundant for an identity-copy bandwidth bench, which only depends on element size). All tile benches: use nvbench:: type aliases (int8_t/int16_t/int32_t/float32_t/float64_t) for the element-type axis instead of std:: -- same underlying types, but nvbench's namespaced spelling is the bench convention. __half/__nv_bfloat16 (pytorch) keep their names; no nvbench alias.
---
 cub/benchmarks/bench/transform/tile/babelstream.cu | 2 +-
 cub/benchmarks/bench/transform/tile/copy.cu        | 2 +-
 cub/benchmarks/bench/transform/tile/grayscale.cu   | 2 +-
 cub/benchmarks/bench/transform/tile/pytorch.cu     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 60ccfd9f5f7..45260565b2a 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -122,7 +122,7 @@ CUB_NAMESPACE_END
 #ifdef TUNE_T
 using element_types = nvbench::type_list<TUNE_T>;
 #else
-using element_types = nvbench::type_list<std::int8_t, std::int16_t, float, double>;
+using element_types = nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::float32_t, nvbench::float64_t>;
 #endif
 
 inline auto array_size_powers = nvbench::range(16, 32, 4);
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index e766c420286..e6b869f15e0 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -49,7 +49,7 @@ CUB_NAMESPACE_END
 #ifdef TUNE_T
 using element_types = nvbench::type_list<TUNE_T>;
 #else
-using element_types = nvbench::type_list<std::int8_t, std::int16_t, std::int32_t, float, double>;
+using element_types = nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::float64_t>;
 #endif
 
 template <typename T>
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index daee79afc16..f7e2a581ae0 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -54,7 +54,7 @@ CUB_NAMESPACE_END
 #ifdef TUNE_T
 using value_types = nvbench::type_list<TUNE_T>;
 #else
-using value_types = nvbench::type_list<float, double>;
+using value_types = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
 #endif
 
 template <typename T>
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index 527ac65eb72..be11d97523f 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -417,7 +417,7 @@ using element_types = nvbench::type_list<
 #  if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
   __nv_bfloat16,
 #  endif
-  float>;
+  nvbench::float32_t>;
 #endif
 
 template <typename Op, typename T>

From b1df2c620102c685d0fca29ec6c8ed0bffb4ef0f Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 15 Jun 2026 09:51:48 -0700
Subject: [PATCH 73/83] collapse tile_mufu_heavy to a single variable template

Same treatment as tile_eligible: drop the false_type struct, make tile_mufu_heavy_v the specializable extension point directly. The dispatch consumer and the pytorch bench's specializations already used the _v form; this just removes the redundant struct, matching the _v-trait convention.
---
 cub/benchmarks/bench/transform/tile/pytorch.cu  | 17 ++++++-----------
 .../dispatch/dispatch_transform_tile_traits.cuh | 14 ++++----------
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index be11d97523f..a63a89f68da 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 // Tile variant of the PyTorch-style transform benches. Each named op registers a tile_operator
-// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy<> so the tile policy picker caps
+// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy_v so the tile policy picker caps
 // items/thread at the vector width on sub-4-byte types. Under --enable-tile +
 // CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this
 // is the standard CUB path. This file disappears once tile dispatch is fully transparent.
@@ -331,20 +331,15 @@ struct tile_operator<exp_op>
 // MUFU-heavy unary ops: hint the tile policy picker to cap items/thread at the vector width on
 // sub-4-byte types.
 template <>
-struct tile_mufu_heavy<sigmoid_op> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_mufu_heavy_v<sigmoid_op> = true;
 template <>
-struct tile_mufu_heavy<tanh_op> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_mufu_heavy_v<tanh_op> = true;
 template <>
-struct tile_mufu_heavy<gelu_op> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_mufu_heavy_v<gelu_op> = true;
 template <>
-struct tile_mufu_heavy<sin_op> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_mufu_heavy_v<sin_op> = true;
 template <>
-struct tile_mufu_heavy<exp_op> : ::cuda::std::true_type
-{};
+inline constexpr bool tile_mufu_heavy_v<exp_op> = true;
 
 // Binary
 template <class T>
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
index d9db2d4684c..ad4e05926e0 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -16,9 +16,7 @@
 //                                   on ct::tile. Omitting it is a clear
 //                                   static_assert, not a cryptic kernel error.
 //   tile_operator_t<Op>         -- alias for tile_operator<Op>::type.
-//   tile_mufu_heavy<Op>         -- specialize to flag Op as MUFU-heavy; the
-//                                   tile policy picker uses this hint.
-//   tile_mufu_heavy_v<...>      -- variable-template companion.
+//   tile_mufu_heavy_v<Op>       -- specialize to true to flag Op as MUFU-heavy; the tile policy picker uses it.
 //
 // Eligibility ("may this combo use the tile path?") and substitution ("which
 // __tile__ functor do we actually run?") are separate traits, so an eligible op
@@ -72,14 +70,10 @@ struct tile_operator
 template <typename Op>
 using tile_operator_t = typename tile_operator<Op>::type;
 
-// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq). Setting this makes
-// the tile policy picker cap items/thread so MUFU pipes are not oversaturated.
+// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq); specialize to true to make the tile
+// policy picker cap items/thread so MUFU pipes are not oversaturated.
 template <typename Op>
-struct tile_mufu_heavy : ::cuda::std::false_type
-{};
-
-template <typename Op>
-inline constexpr bool tile_mufu_heavy_v = tile_mufu_heavy<Op>::value;
+inline constexpr bool tile_mufu_heavy_v = false;
 } // namespace transform
 
 // Internal substitutes shipped by CCCL.

From 7428773a71fe2fa2ad4fbd24f8162be76e21f1d3 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 15 Jun 2026 15:23:42 -0700
Subject: [PATCH 74/83] libcudacxx: undef tile-unsupported builtins instead of
 stripping _CCCL_API

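Restore the upstream _CCCL_API definition (including _CCCL_TILE). Under tile compilation,
_CCCL_BUILTIN_ASSUME_ALIGNED is redefined as a pass-through that returns the pointer unchanged, and
__builtin_launder is likewise not used, because the tile compiler does not support these builtins.
Green on 13.5; 13.4 still needs the _CCCL_TILE strip (exec-check unfixed).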
---
 libcudacxx/include/cuda/std/__cccl/builtin.h    |  5 +++++
 libcudacxx/include/cuda/std/__cccl/visibility.h | 10 +---------
 libcudacxx/include/cuda/std/__new/launder.h     |  4 ++++
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h
index f6cd76cc929..69d1509ebd2 100644
--- a/libcudacxx/include/cuda/std/__cccl/builtin.h
+++ b/libcudacxx/include/cuda/std/__cccl/builtin.h
@@ -112,6 +112,11 @@
 #  define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__)
 #endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned)
 
+#if _CCCL_TILE_COMPILATION() // __builtin_assume_aligned is not supported in tile mode
+#  undef _CCCL_BUILTIN_ASSUME_ALIGNED
+#  define _CCCL_BUILTIN_ASSUME_ALIGNED(_Ptr, ...) (_Ptr)
+#endif // _CCCL_TILE_COMPILATION()
+
 #if _CCCL_CHECK_BUILTIN(builtin_constant_p) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__)
 #endif // _CCCL_CHECK_BUILTIN(builtin_constant_p)
diff --git a/libcudacxx/include/cuda/std/__cccl/visibility.h b/libcudacxx/include/cuda/std/__cccl/visibility.h
index 47337d8d8fd..075a98130aa 100644
--- a/libcudacxx/include/cuda/std/__cccl/visibility.h
+++ b/libcudacxx/include/cuda/std/__cccl/visibility.h
@@ -116,15 +116,7 @@
 #  define _CCCL_DEVICE_API      _CCCL_DEVICE
 #  define _CCCL_TILE_API        _CCCL_TILE
 #else // ^^^ _CCCL_COMPILER(NVHPC) ^^^ / vvv !_CCCL_COMPILER(NVHPC) vvv
-// Local fork patch: drop _CCCL_TILE from _CCCL_API. Under the tile compiler's
-// local-only context check, marking a host/device utility __tile__ means its
-// body must satisfy tile restrictions even when the caller is non-tile. That
-// fails for any utility that takes a user-provided callable (apply, invoke,
-// visit, runtime_assume_aligned, ...). Drop the marker globally; tile DSL
-// code in this branch uses its own tile-marked operations and doesn't depend
-// on libcudacxx utilities being tile-callable. Revert when upstream fixes the
-// marking discipline (or the compiler adopts per-instantiation checking).
-#  define _CCCL_API             _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
+#  define _CCCL_API             _CCCL_TILE _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
 #  define _CCCL_HOST_DEVICE_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
 #  define _CCCL_HOST_API        _CCCL_HOST _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
 #  define _CCCL_DEVICE_API      _CCCL_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION
diff --git a/libcudacxx/include/cuda/std/__new/launder.h b/libcudacxx/include/cuda/std/__new/launder.h
index e2f3af192a0..3d67950fc18 100644
--- a/libcudacxx/include/cuda/std/__new/launder.h
+++ b/libcudacxx/include/cuda/std/__new/launder.h
@@ -32,6 +32,10 @@
 #  define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__)
 #endif // _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC)
 
+#if _CCCL_TILE_COMPILATION() // __builtin_launder is not supported in tile mode
+#  undef _CCCL_BUILTIN_LAUNDER
+#endif // _CCCL_TILE_COMPILATION()
+
 _CCCL_BEGIN_NAMESPACE_CUDA_STD
 
 template <class _Tp>

From 0c13142fb3a1d31db586d2d4b7999b65e7b1d5e3 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Mon, 22 Jun 2026 14:38:10 -0700
Subject: [PATCH 75/83] drop aligned_size_t tile-commit hint so all
 DeviceTransform changes are behind the dispatch macro

---
 cub/cub/device/device_transform.cuh | 41 ++++++++---------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index c6059b1cb69..5a8cfdad497 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -27,7 +27,6 @@
 #include <cuda/__functional/always_true_false.h>
 #include <cuda/__functional/call_or.h>
 #include <cuda/__iterator/zip_iterator.h>
-#include <cuda/__memory/aligned_size.h>
 #include <cuda/__stream/get_stream.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/tuple>
@@ -98,14 +97,7 @@ struct DeviceTransform
     // https://github.com/NVIDIA/cccl/issues/8805 for data. We use choose_signed_offset to just check if it can hold the
     // value passed by the user, but otherwise ignore the chosen signed offset type.
     using offset_t = ::cuda::std::int64_t;
-
-    // num_items may be a plain integer or cuda::aligned_size_t<N> (the cuda::memcpy_async-style opt-in promising N-byte
-    // pointer alignment + size divisibility). Unwrap to a plain integer for the offset machinery (choose_signed_offset
-    // needs an integral type); the tile hook below reads the alignment promise. No-op for a plain integer.
-    constexpr auto num_items_align = ::cuda::__get_size_align_v<NumItemsT>;
-    using count_t                  = ::cuda::std::conditional_t<(num_items_align > 1), ::cuda::std::size_t, NumItemsT>;
-    const count_t count            = static_cast<count_t>(num_items);
-    if (const cudaError_t error = detail::choose_signed_offset<count_t>::is_exceeding_offset_type(count))
+    if (const cudaError_t error = detail::choose_signed_offset<NumItemsT>::is_exceeding_offset_type(num_items))
     {
       return error;
     }
@@ -113,32 +105,21 @@ struct DeviceTransform
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
 #if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
-    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+ we route to the
-    // tile kernel:
-    //  - if num_items is a cuda::aligned_size_t<N>=16, the caller has promised 16-byte pointer
-    //    alignment + divisibility, so we commit to tile at compile time and skip the runtime check;
-    //  - otherwise we check the alignment/divisibility preconditions at runtime and fall through to
-    //    the standard CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail
-    //    case, so this is a graceful fallback, not an error).
-    // device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if the capability
-    // query fails) we fall through to the standard CUB dispatch.
+    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+, we check the
+    // alignment/divisibility preconditions at runtime and route to the tile kernel; we fall through to the standard
+    // CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail case, so this is a graceful
+    // fallback, not an error). device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if
+    // the capability query fails) we fall through to the standard CUB dispatch.
     if constexpr (StableAddress == detail::transform::requires_stable_address::no
                   && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
                   && cub::detail::transform::tile::
                     tile_dispatch_eligible_v<TransformOp, RandomAccessIteratorOut, RandomAccessIteratorsIn...>)
     {
-      if (cub::detail::transform::tile::device_supports_tile())
+      if (cub::detail::transform::tile::device_supports_tile()
+          && cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(num_items)))
       {
-        if constexpr (num_items_align >= 16)
-        {
-          return cub::detail::transform::tile::dispatch<TransformOp>(
-            inputs, output, static_cast<offset_t>(count), stream);
-        }
-        else if (cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(count)))
-        {
-          return cub::detail::transform::tile::dispatch<TransformOp>(
-            inputs, output, static_cast<offset_t>(count), stream);
-        }
+        return cub::detail::transform::tile::dispatch<TransformOp>(
+          inputs, output, static_cast<offset_t>(num_items), stream);
       }
     }
 #endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
@@ -161,7 +142,7 @@ struct DeviceTransform
     return detail::transform::dispatch<StableAddress>(
       ::cuda::std::move(inputs),
       ::cuda::std::move(output),
-      static_cast<offset_t>(count),
+      static_cast<offset_t>(num_items),
       ::cuda::std::move(predicate),
       ::cuda::std::move(transform_op),
       stream,

From da688b6beb678f1f2d54bd1d966e37e35e465d92 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:17:47 -0700
Subject: [PATCH 76/83] tile: drop internal DeviceTransform struct; fold
 tile-size pick into free-function dispatch

---
 .../dispatch/dispatch_transform_tile.cuh      | 55 +++++--------------
 1 file changed, 15 insertions(+), 40 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 48393cc6dcc..0370c258448 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -4,10 +4,8 @@
 // Internal dispatch helpers for cub::DeviceTransform's tile path:
 //   tile_dispatch_eligible_v  -- compile-time predicate the hook consults
 //   runtime_preconditions_valid  -- runtime alignment + divisibility predicate
-//   dispatch                  -- bridge that launches the tile kernel with
-//                                the trait's substitute functor
-//   DeviceTransform           -- internal tile-local Transform/Fill wrappers
-//                                used by `dispatch`
+//   dispatch                  -- bridge that picks the tile size and launches
+//                                the tile kernel with the trait's substitute functor
 // User-facing extension points (tile_eligible / tile_mufu_heavy) live in
 // dispatch_transform_tile_traits.cuh under cub::transform.
 // Requires CTK 13.4 or newer and nvcc invoked with --enable-tile.
@@ -75,37 +73,6 @@ template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std:
   return CubDebug(::cudaGetLastError());
 }
 
-struct DeviceTransform
-{
-  template <int TileSize = 0, bool MufuHeavy = false, typename Fn, typename Out, typename... Ins>
-  [[nodiscard]] static ::cudaError_t Transform(
-    ::cuda::std::tuple<Ins*...> inputs, Out* output, ::cuda::std::int64_t num_items, Fn, ::cudaStream_t stream = nullptr)
-  {
-    constexpr int chosen =
-      (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<Out, Ins...>(MufuHeavy);
-    return cub::detail::transform::tile::launch_impl<chosen, Fn>(
-      inputs, output, num_items, stream, ::cuda::std::index_sequence_for<Ins...>{});
-  }
-
-  // Fill
-  template <int TileSize = 0, typename T>
-  [[nodiscard]] static ::cudaError_t
-  Fill(T* output, ::cuda::std::int64_t num_items, T value, ::cudaStream_t stream = nullptr)
-  {
-    if (num_items <= 0)
-    {
-      return ::cudaSuccess;
-    }
-    constexpr int chosen = (TileSize > 0) ? TileSize : cub::detail::transform::tile::pick_tile_size<T>();
-    // One CTA per tile; see launch_impl -- num_blocks can't exceed the unsigned grid x-dim for
-    // any device-sized num_items.
-    const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{chosen});
-    cub::detail::transform::tile::fill_kernel<chosen, T>
-      <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, value);
-    return CubDebug(::cudaGetLastError());
-  }
-};
-
 // Combined compile-time predicate for whether (Op, OutIter, InIters...) can use the tile path. We use this with
 // `if constexpr` for dispatch: when true the hook tries the tile kernel first and, on runtime alignment/divisibility
 // failure, falls through to the standard CUB dispatch; when false the tile branch is discarded entirely.
@@ -154,21 +121,29 @@ template <typename TransformOp, typename OutIter, typename... InIters, typename
 [[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t
 dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
 {
-  auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
-  auto in_ptrs = ::cuda::std::apply(
+  const auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  const auto in_ptrs = ::cuda::std::apply(
     [](auto... iters) {
       return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
     },
     inputs);
-  // The tile functor to run for TransformOp: its registered tile_operator mirror.
+
+  // The tile functor to run for TransformOp: its registered tile_operator mirror (a scalar functor can't be
+  // invoked on ct::tile).
   using tile_op_t = cub::transform::tile_operator_t<TransformOp>;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_operator type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_operator type must be trivially default constructible");
 
-  return DeviceTransform::Transform<0, cub::transform::tile_mufu_heavy_v<TransformOp>, tile_op_t>(
-    in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), tile_op_t{}, stream);
+  // Pick the tile size from the element types (no caller override -- mirrors the regular path, where the policy
+  // drives the size), then launch.
+  constexpr int tile_size =
+    cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t<OutIter>,
+                                                 ::cuda::std::iter_value_t<InIters>...>(
+      cub::transform::tile_mufu_heavy_v<TransformOp>);
+  return cub::detail::transform::tile::launch_impl<tile_size, tile_op_t>(
+    in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), stream, ::cuda::std::index_sequence_for<InIters...>{});
 }
 } // namespace detail::transform::tile
 

From ebede8815781d203dcec7eb0a2cff494526ba99c Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:20:22 -0700
Subject: [PATCH 77/83] tile: remove unused fill_kernel (no Fill hook wired)

---
 .../dispatch/kernels/kernel_transform_tile.cuh     | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index bd36bd1843d..b9627748cb3 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -43,7 +43,7 @@ template <int TileSize, typename T, typename N>
   return ct::partition_view{span, ct::shape<TileSize>{}};
 }
 
-// Tile DSL kernels backing cub::DeviceTransform's tile path. They assume 16-byte pointer alignment + 16-divisible
+// Tile DSL kernel backing cub::DeviceTransform's tile path. It assumes 16-byte pointer alignment + 16-divisible
 // num_items (so the compiler picks LDG.E.128); the dispatch header honors that. NV_IF_TARGET(NV_PROVIDES_SM_80)
 // guards the body -- tile needs sm_80+, so sub-80 arches get a no-op kernel (dispatch only launches it on sm_80+).
 //   assume_divisible<16>     -- num_items % 16 == 0, so the tile DSL can elide tail handling.
@@ -62,18 +62,6 @@ transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, co
      const auto out_view                       = make_aligned_partition_view<TileSize>(out, n);
      out_view.store_masked(Fn{}(make_aligned_partition_view<TileSize>(ins, n).load_masked(bx)...), bx);));
 }
-
-template <int TileSize, typename T>
-__tile_global__ void fill_kernel(const ::cuda::std::int64_t num_items, T* __restrict__ out, const T value)
-{
-  namespace ct = ::cuda::tiles;
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_80,
-    (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
-     const auto out_view                       = make_aligned_partition_view<TileSize>(out, n);
-     using tile_t                              = ct::tile<T, ct::shape<TileSize>>;
-     out_view.store_masked(ct::full<tile_t>(value), bx);));
-}
 } // namespace detail::transform::tile
 
 CUB_NAMESPACE_END

From cd8d429a93cfdce838ed0d8fc21ea1bf1112a362 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:26:30 -0700
Subject: [PATCH 78/83] tile: drop redundant inline comments in dispatch

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 0370c258448..8a44d0f9e9f 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -128,16 +128,12 @@ dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_item
     },
     inputs);
 
-  // The tile functor to run for TransformOp: its registered tile_operator mirror (a scalar functor can't be
-  // invoked on ct::tile).
   using tile_op_t = cub::transform::tile_operator_t<TransformOp>;
   static_assert(::cuda::std::is_empty_v<tile_op_t>,
                 "tile_operator type must be stateless (the tile kernel default-constructs it)");
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_operator type must be trivially default constructible");
 
-  // Pick the tile size from the element types (no caller override -- mirrors the regular path, where the policy
-  // drives the size), then launch.
   constexpr int tile_size =
     cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t<OutIter>,
                                                  ::cuda::std::iter_value_t<InIters>...>(

From 8a8724bfc18a57cce076bf45b398a0e463c0752e Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:26:30 -0700
Subject: [PATCH 79/83] tile: use Apache-2.0 license header for new bench files

---
 cub/benchmarks/bench/transform/tile/babelstream.cu | 2 +-
 cub/benchmarks/bench/transform/tile/copy.cu        | 2 +-
 cub/benchmarks/bench/transform/tile/grayscale.cu   | 2 +-
 cub/benchmarks/bench/transform/tile/pytorch.cu     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
index 45260565b2a..412d5957da3 100644
--- a/cub/benchmarks/bench/transform/tile/babelstream.cu
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 // Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by
 // named, stateless ops that register a tile_operator substitute (gated). Under --enable-tile +
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
index e6b869f15e0..a0c32e3d16d 100644
--- a/cub/benchmarks/bench/transform/tile/copy.cu
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 // Pure copy (identity transform) -- measures plain load/store bandwidth through the tile
 // load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
index f7e2a581ae0..fbc539fa31c 100644
--- a/cub/benchmarks/bench/transform/tile/grayscale.cu
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 // Tile variant of the grayscale transform bench. Unlike the base bench (a single rgb_t<T> struct
 // input), this uses three separate R/G/B streams so the inputs are plain element types the tile path
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
index a63a89f68da..deff05e2852 100644
--- a/cub/benchmarks/bench/transform/tile/pytorch.cu
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3-Clause
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 // Tile variant of the PyTorch-style transform benches. Each named op registers a tile_operator
 // substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy_v so the tile policy picker caps

From 20fd23705687ea3204d84b21e2c937853387590d Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:28:35 -0700
Subject: [PATCH 80/83] tile: clang-format dispatch and fix stale struct
 reference in comment

---
 .../device/dispatch/dispatch_transform_tile.cuh   | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 8a44d0f9e9f..6803aa66541 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -114,7 +114,7 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
   return aligned_out && aligned_in && (num_items % items_divisor) == 0;
 }
 
-// Bridge from cub::DeviceTransform::__transform_internal to the tile DeviceTransform. Precondition (the caller
+// Bridge from cub::DeviceTransform::__transform_internal to the tile kernel. Precondition (the caller
 // checks it): tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. Launches the kernel
 // with tile_operator_t<Op> -- Op's registered __tile__ mirror (a scalar functor can't be invoked on ct::tile).
 template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
@@ -134,12 +134,15 @@ dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_item
   static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
                 "tile_operator type must be trivially default constructible");
 
-  constexpr int tile_size =
-    cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t<OutIter>,
-                                                 ::cuda::std::iter_value_t<InIters>...>(
-      cub::transform::tile_mufu_heavy_v<TransformOp>);
+  constexpr int tile_size = cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t<OutIter>,
+                                                                         ::cuda::std::iter_value_t<InIters>...>(
+    cub::transform::tile_mufu_heavy_v<TransformOp>);
   return cub::detail::transform::tile::launch_impl<tile_size, tile_op_t>(
-    in_ptrs, out_ptr, static_cast<::cuda::std::int64_t>(num_items), stream, ::cuda::std::index_sequence_for<InIters...>{});
+    in_ptrs,
+    out_ptr,
+    static_cast<::cuda::std::int64_t>(num_items),
+    stream,
+    ::cuda::std::index_sequence_for<InIters...>{});
 }
 } // namespace detail::transform::tile
 

From 581fcf4e364e22d82abc3c8f164d719d3b142594 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:43:53 -0700
Subject: [PATCH 81/83] tile: mark out_ptr const in runtime_preconditions_valid

---
 cub/cub/device/dispatch/dispatch_transform_tile.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
index 6803aa66541..c319f429682 100644
--- a/cub/cub/device/dispatch/dispatch_transform_tile.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -101,7 +101,7 @@ runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIte
   constexpr int byte_align    = 16;
   constexpr int items_divisor = 16;
 
-  auto out_ptr           = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  const auto out_ptr     = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
   const bool aligned_out = ::cuda::std::is_sufficiently_aligned<byte_align>(out_ptr);
   const bool aligned_in  = ::cuda::std::apply(
     [](auto... iters) {

From 3ce1647aaf13c870aafe6aad0af2edd3594515e0 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:43:53 -0700
Subject: [PATCH 82/83] tile: drop __restrict__ from kernel params (API permits
 in-place transforms)

---
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index b9627748cb3..d93dbd5bd2a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -53,7 +53,7 @@ template <int TileSize, typename T, typename N>
 // miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR).
 template <int TileSize, typename Fn, typename Out, typename... Ins>
 __tile_global__ void
-transform_kernel(const ::cuda::std::int64_t num_items, Out* __restrict__ out, const Ins* __restrict__... ins)
+transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins)
 {
   namespace ct = ::cuda::tiles;
   NV_IF_TARGET(

From f7729e726b21400e55fa953f4c36231e5fb992a8 Mon Sep 17 00:00:00 2001
From: Nan An <nan@nvidia.com>
Date: Tue, 23 Jun 2026 13:51:08 -0700
Subject: [PATCH 83/83] tile: reflow transform_kernel signature (clang-format)

---
 cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
index d93dbd5bd2a..b2e8f2b5e68 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -52,8 +52,7 @@ template <int TileSize, typename T, typename N>
 // NOTE: make_aligned_partition_view is invoked directly -- do NOT wrap these calls in a lambda: nvcc 13.4
 // miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR).
 template <int TileSize, typename Fn, typename Out, typename... Ins>
-__tile_global__ void
-transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins)
+__tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins)
 {
   namespace ct = ::cuda::tiles;
   NV_IF_TARGET(