diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt
index 4c8c778f7fe..820614d1e65 100644
--- a/cub/CMakeLists.txt
+++ b/cub/CMakeLists.txt
@@ -10,6 +10,34 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
 option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
 option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)
 
+# Build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. Defaults ON when the
+# toolkit can build it (CTK 13.4+), so 13.4+ configs -- including CI -- exercise the tile path automatically; OFF
+# and compiled out below 13.4. The sm_80+ floor is enforced at runtime (dispatch cc check + NV_IF_TARGET in the
+# kernels), so a 13.4+ build still runs correctly on any GPU.
+set(_cccl_tile_transform_default OFF)
+if (
+  "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA"
+  AND NOT "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4
+)
+  set(_cccl_tile_transform_default ON)
+endif()
+option(
+  CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+  "Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)."
+  ${_cccl_tile_transform_default}
+)
+if (
+  CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+  AND "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA"
+  AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4
+)
+  message(
+    FATAL_ERROR
+    "CCCL_ENABLE_TILE_TRANSFORM_DISPATCH requires CUDA 13.4+ (nvcc --enable-tile). "
+    "Found ${CMAKE_CUDA_COMPILER_VERSION}."
+  )
+endif()
+
 option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF)
 if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
   set(CUB_ENABLE_TUNING OFF)
diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt
index e54cf1c80db..c9e9e7893cd 100644
--- a/cub/benchmarks/CMakeLists.txt
+++ b/cub/benchmarks/CMakeLists.txt
@@ -91,6 +91,22 @@ function(add_bench target_name bench_name bench_src)
       cccl.nvbench_helper
       nvbench::main
   )
+
+  # Tile-DSL transform benches: build under --enable-tile + the dispatch opt-in when requested. Gated by
+  # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI builds the tile/ benches with the tile path off.
+  if (
+    CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+    AND "${bench_src}" MATCHES "/transform/tile/"
+  )
+    target_compile_options(
+      ${bench_target}
+      PRIVATE "$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--enable-tile>"
+    )
+    target_compile_definitions(
+      ${bench_target}
+      PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+    )
+  endif()
 endfunction()
 
 function(add_bench_dir bench_dir)
diff --git a/cub/benchmarks/bench/transform/tile/babelstream.cu b/cub/benchmarks/bench/transform/tile/babelstream.cu
new file mode 100644
index 00000000000..412d5957da3
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/babelstream.cu
@@ -0,0 +1,230 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by
+// named, stateless ops that register a tile_operator substitute (gated). Under --enable-tile +
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this
+// is the standard CUB transform path. This file disappears once tile dispatch is fully transparent.
+
+#include "../common.h"
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <cuda_tile.h>
+#endif
+
+// Stateless scalar ops, used at the call site in both build modes. Constants are baked in so the ops
+// stay stateless (the tile substitute must be trivially default constructible): with startScalar == -2,
+// `c * scalar` is `-(c + c)`, `b + scalar * c` is `b - c - c`, etc.
+struct mul_op
+{
+  template <class B>
+  __host__ __device__ auto operator()(B b) const
+  {
+    return -(b + b);
+  }
+};
+struct add_op
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+struct triad_op
+{
+  template <class B, class C>
+  __host__ __device__ auto operator()(B b, C c) const
+  {
+    return b - c - c;
+  }
+};
+struct nstream_op
+{
+  template <class A, class B, class C>
+  __host__ __device__ auto operator()(A a, B b, C c) const
+  {
+    return a + b - c - c;
+  }
+};
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+struct tile_mul_op
+{
+  template <class B>
+  __tile__ auto operator()(B b) const
+  {
+    return -(b + b);
+  }
+};
+struct tile_add_op
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+struct tile_triad_op
+{
+  template <class B, class C>
+  __tile__ auto operator()(B b, C c) const
+  {
+    return b - c - c;
+  }
+};
+struct tile_nstream_op
+{
+  template <class A, class B, class C>
+  __tile__ auto operator()(A a, B b, C c) const
+  {
+    return a + b - c - c;
+  }
+};
+
+CUB_NAMESPACE_BEGIN
+namespace transform
+{
+template <class T>
+inline constexpr bool tile_eligible_v<mul_op, T, 1> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<add_op, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<triad_op, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<nstream_op, T, 3> = true;
+template <>
+struct tile_operator<mul_op>
+{
+  using type = tile_mul_op;
+};
+template <>
+struct tile_operator<add_op>
+{
+  using type = tile_add_op;
+};
+template <>
+struct tile_operator<triad_op>
+{
+  using type = tile_triad_op;
+};
+template <>
+struct tile_operator<nstream_op>
+{
+  using type = tile_nstream_op;
+};
+} // namespace transform
+CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+// The tile path does not support __int128 (no tensor_span/partition_view for it), so the type axis
+// omits it relative to the base babelstream bench.
+#ifdef TUNE_T
+using element_types = nvbench::type_list<TUNE_T>;
+#else
+using element_types = nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::float32_t, nvbench::float64_t>;
+#endif
+
+inline auto array_size_powers = nvbench::range(16, 32, 4);
+
+// Same constant inputs as the base bench so nstream maintains a consistent workload.
+inline constexpr auto startA      = 11;
+inline constexpr auto startB      = 2;
+inline constexpr auto startC      = 1;
+inline constexpr auto startScalar = -2;
+static_assert(startA == (startA + startB + startScalar * startC), "nstream must have a consistent workload");
+
+template <typename T>
+static void mul(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{c.begin()}, b.begin(), n, mul_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_mul")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+template <typename T>
+static void add(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, add_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_add")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+template <typename T>
+static void triad(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, triad_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_triad")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+template <typename T>
+static void nstream(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), n, nstream_op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_nstream")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
diff --git a/cub/benchmarks/bench/transform/tile/copy.cu b/cub/benchmarks/bench/transform/tile/copy.cu
new file mode 100644
index 00000000000..a0c32e3d16d
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/copy.cu
@@ -0,0 +1,77 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Pure copy (identity transform) -- measures plain load/store bandwidth through the tile
+// load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under
+// --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes it to the tile kernel,
+// otherwise it falls through to CUB's standard transform. This file disappears once tile dispatch is
+// fully transparent.
+
+#include "../common.h"
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <cuda_tile.h>
+#endif
+
+struct identity
+{
+  template <class T>
+  __host__ __device__ auto operator()(T v) const
+  {
+    return v;
+  }
+};
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+struct tile_identity
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return v;
+  }
+};
+
+CUB_NAMESPACE_BEGIN
+namespace transform
+{
+template <class T>
+inline constexpr bool tile_eligible_v<identity, T, 1> = true;
+template <>
+struct tile_operator<identity>
+{
+  using type = tile_identity;
+};
+} // namespace transform
+CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+#ifdef TUNE_T
+using element_types = nvbench::type_list<TUNE_T>;
+#else
+using element_types = nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::float64_t>;
+#endif
+
+template <typename T>
+static void copy(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+
+  thrust::device_vector<T> in = generate(n);
+  thrust::device_vector<T> out(n, thrust::no_init);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, identity{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(element_types))
+  .set_name("tile_copy")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
diff --git a/cub/benchmarks/bench/transform/tile/grayscale.cu b/cub/benchmarks/bench/transform/tile/grayscale.cu
new file mode 100644
index 00000000000..fbc539fa31c
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/grayscale.cu
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tile variant of the grayscale transform bench. Unlike the base bench (a single rgb_t<T> struct
+// input), this uses three separate R/G/B streams so the inputs are plain element types the tile path
+// can vectorize. The named rgb_to_y op registers a tile_operator substitute (gated). This file
+// disappears once tile dispatch is fully transparent.
+
+#include "../common.h"
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <cuda_tile.h>
+#endif
+
+struct rgb_to_y
+{
+  template <class R, class G, class B>
+  __host__ __device__ auto operator()(R r, G g, B b) const
+  {
+    constexpr float w_r = 0.2989f;
+    constexpr float w_g = 0.587f;
+    constexpr float w_b = 0.114f;
+    return w_r * r + w_g * g + w_b * b;
+  }
+};
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+struct tile_rgb_to_y
+{
+  template <class R, class G, class B>
+  __tile__ auto operator()(R r, G g, B b) const
+  {
+    constexpr float w_r = 0.2989f;
+    constexpr float w_g = 0.587f;
+    constexpr float w_b = 0.114f;
+    return w_r * r + w_g * g + w_b * b;
+  }
+};
+
+CUB_NAMESPACE_BEGIN
+namespace transform
+{
+template <class T>
+inline constexpr bool tile_eligible_v<rgb_to_y, T, 3> = true;
+template <>
+struct tile_operator<rgb_to_y>
+{
+  using type = tile_rgb_to_y;
+};
+} // namespace transform
+CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+#ifdef TUNE_T
+using value_types = nvbench::type_list<TUNE_T>;
+#else
+using value_types = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
+#endif
+
+template <typename T>
+static void grayscale(nvbench::state& state, nvbench::type_list<T>)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+
+  thrust::device_vector<T> r = generate(n);
+  thrust::device_vector<T> g = generate(n);
+  thrust::device_vector<T> b = generate(n);
+  thrust::device_vector<T> out(n, thrust::no_init);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n); // matches the base bench's rgb_t<T> = 3 * sizeof(T)
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{r.begin(), g.begin(), b.begin()}, out.begin(), n, rgb_to_y{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+NVBENCH_BENCH_TYPES(grayscale, NVBENCH_TYPE_AXES(value_types))
+  .set_name("tile_grayscale")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
diff --git a/cub/benchmarks/bench/transform/tile/pytorch.cu b/cub/benchmarks/bench/transform/tile/pytorch.cu
new file mode 100644
index 00000000000..deff05e2852
--- /dev/null
+++ b/cub/benchmarks/bench/transform/tile/pytorch.cu
@@ -0,0 +1,493 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tile variant of the PyTorch-style transform benches. Each named op registers a tile_operator
+// substitute (gated); MUFU-heavy ops also opt into tile_mufu_heavy_v so the tile policy picker caps
+// items/thread at the vector width on sub-4-byte types. Under --enable-tile +
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this
+// is the standard CUB path. This file disappears once tile dispatch is fully transparent.
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cuda/std/cmath>
+
+#include "../common.h"
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <cuda_tile.h>
+#endif
+
+// Scalar ops the user passes to Transform. Sub-4-byte input types compute in float and cast back,
+// matching the tile substitutes below.
+template <class T>
+__host__ __device__ float to_f(T v)
+{
+  return static_cast<float>(v);
+}
+template <class T>
+__host__ __device__ T from_f(float f)
+{
+  return static_cast<T>(f);
+}
+
+struct relu_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    float f = to_f(v);
+    return from_f<T>(f > 0.0f ? f : 0.0f);
+  }
+};
+struct sigmoid_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    float f = to_f(v);
+    return from_f<T>(1.0f / (1.0f + ::cuda::std::exp(-f)));
+  }
+};
+struct tanh_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return from_f<T>(::cuda::std::tanh(to_f(v)));
+  }
+};
+struct gelu_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;
+    float f = to_f(v);
+    return from_f<T>(0.5f * f * (1.0f + ::cuda::std::tanh(k0 * (f + k1 * f * f * f))));
+  }
+};
+struct sin_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return from_f<T>(::cuda::std::sin(to_f(v)));
+  }
+};
+struct exp_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return from_f<T>(::cuda::std::exp(to_f(v)));
+  }
+};
+
+struct binary_add
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+struct binary_sub
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a - b;
+  }
+};
+struct binary_mul
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a * b;
+  }
+};
+struct binary_div
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a / b;
+  }
+};
+struct binary_le
+{
+  template <class A, class B>
+  __host__ __device__ A operator()(A a, B b) const
+  {
+    return static_cast<A>(a <= b);
+  }
+};
+struct binary_ge
+{
+  template <class A, class B>
+  __host__ __device__ A operator()(A a, B b) const
+  {
+    return static_cast<A>(a >= b);
+  }
+};
+struct binary_fmin
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a < b ? a : b;
+  }
+};
+struct binary_fmax
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return a > b ? a : b;
+  }
+};
+
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+namespace ct = ::cuda::tiles;
+
+template <class T>
+__tile__ auto as_float(T v)
+{
+  return ct::element_cast<float>(v);
+}
+template <class T, class F>
+__tile__ auto from_float(F f)
+{
+  return ct::element_cast<ct::tile_element_t<T>>(f);
+}
+
+struct tile_relu
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    auto f = as_float(v);
+    return from_float<T>(ct::select(f > 0.0f, f, f - f));
+  }
+};
+struct tile_sigmoid
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    auto f = as_float(v);
+    return from_float<T>(1.0f / (1.0f + ct::exp(-f)));
+  }
+};
+struct tile_tanh
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return from_float<T>(ct::tanh(as_float(v)));
+  }
+};
+struct tile_gelu
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    constexpr float k0 = 0.7978845608028654f, k1 = 0.044715f;
+    auto f = as_float(v);
+    return from_float<T>(0.5f * f * (1.0f + ct::tanh(k0 * (f + k1 * f * f * f))));
+  }
+};
+struct tile_sin
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return from_float<T>(ct::sin(as_float(v)));
+  }
+};
+struct tile_exp
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return from_float<T>(ct::exp(as_float(v)));
+  }
+};
+
+struct tile_binary_add
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+struct tile_binary_sub
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a - b;
+  }
+};
+struct tile_binary_mul
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a * b;
+  }
+};
+struct tile_binary_div
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a / b;
+  }
+};
+struct tile_binary_le
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::element_cast<ct::tile_element_t<A>>(a <= b);
+  }
+};
+struct tile_binary_ge
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::element_cast<ct::tile_element_t<A>>(a >= b);
+  }
+};
+struct tile_binary_fmin
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::select(a < b, a, b);
+  }
+};
+struct tile_binary_fmax
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return ct::select(a > b, a, b);
+  }
+};
+
+CUB_NAMESPACE_BEGIN
+namespace transform
+{
+// Unary
+template <class T>
+inline constexpr bool tile_eligible_v<relu_op, T, 1> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<sigmoid_op, T, 1> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<tanh_op, T, 1> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<gelu_op, T, 1> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<sin_op, T, 1> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<exp_op, T, 1> = true;
+template <>
+struct tile_operator<relu_op>
+{
+  using type = tile_relu;
+};
+template <>
+struct tile_operator<sigmoid_op>
+{
+  using type = tile_sigmoid;
+};
+template <>
+struct tile_operator<tanh_op>
+{
+  using type = tile_tanh;
+};
+template <>
+struct tile_operator<gelu_op>
+{
+  using type = tile_gelu;
+};
+template <>
+struct tile_operator<sin_op>
+{
+  using type = tile_sin;
+};
+template <>
+struct tile_operator<exp_op>
+{
+  using type = tile_exp;
+};
+
+// MUFU-heavy unary ops: hint the tile policy picker to cap items/thread at the vector width on
+// sub-4-byte types.
+template <>
+inline constexpr bool tile_mufu_heavy_v<sigmoid_op> = true;
+template <>
+inline constexpr bool tile_mufu_heavy_v<tanh_op> = true;
+template <>
+inline constexpr bool tile_mufu_heavy_v<gelu_op> = true;
+template <>
+inline constexpr bool tile_mufu_heavy_v<sin_op> = true;
+template <>
+inline constexpr bool tile_mufu_heavy_v<exp_op> = true;
+
+// Binary
+template <class T>
+inline constexpr bool tile_eligible_v<binary_add, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_sub, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_mul, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_div, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_le, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_ge, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_fmin, T, 2> = true;
+template <class T>
+inline constexpr bool tile_eligible_v<binary_fmax, T, 2> = true;
+template <>
+struct tile_operator<binary_add>
+{
+  using type = tile_binary_add;
+};
+template <>
+struct tile_operator<binary_sub>
+{
+  using type = tile_binary_sub;
+};
+template <>
+struct tile_operator<binary_mul>
+{
+  using type = tile_binary_mul;
+};
+template <>
+struct tile_operator<binary_div>
+{
+  using type = tile_binary_div;
+};
+template <>
+struct tile_operator<binary_le>
+{
+  using type = tile_binary_le;
+};
+template <>
+struct tile_operator<binary_ge>
+{
+  using type = tile_binary_ge;
+};
+template <>
+struct tile_operator<binary_fmin>
+{
+  using type = tile_binary_fmin;
+};
+template <>
+struct tile_operator<binary_fmax>
+{
+  using type = tile_binary_fmax;
+};
+} // namespace transform
+CUB_NAMESPACE_END
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+#ifdef TUNE_T
+using element_types = nvbench::type_list<TUNE_T>;
+#else
+using element_types = nvbench::type_list<
+#  if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
+  __half,
+#  endif
+#  if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
+  __nv_bfloat16,
+#  endif
+  nvbench::float32_t>;
+#endif
+
+template <typename Op, typename T>
+static void run_unary(nvbench::state& state)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> in(n, T(1));
+  thrust::device_vector<T> out(n, thrust::no_init);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, Op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+template <typename Op, typename T>
+static void run_binary(nvbench::state& state)
+try
+{
+  const auto n = state.get_int64("Elements{io}");
+  thrust::device_vector<T> a(n, T(1));
+  thrust::device_vector<T> b(n, T(1));
+  thrust::device_vector<T> out(n, thrust::no_init);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, out.begin(), n, Op{});
+}
+catch (const std::bad_alloc&)
+{
+  state.skip("Skipping: out of memory.");
+}
+
+inline auto pt_sizes = nvbench::range(16, 32, 4);
+
+#define UNARY_BENCH(name, op)                                            \
+  template <typename T>                                                  \
+  static void name##_bench(nvbench::state& state, nvbench::type_list<T>) \
+  {                                                                      \
+    run_unary<op, T>(state);                                             \
+  }                                                                      \
+  NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types))    \
+    .set_name("tile_" #name)                                             \
+    .set_type_axes_names({"T{ct}"})                                      \
+    .add_int64_power_of_two_axis("Elements{io}", pt_sizes)
+
+UNARY_BENCH(relu, relu_op);
+UNARY_BENCH(sigmoid, sigmoid_op);
+UNARY_BENCH(tanh, tanh_op);
+UNARY_BENCH(gelu, gelu_op);
+UNARY_BENCH(sin, sin_op);
+UNARY_BENCH(exp, exp_op);
+
+#define BINARY_BENCH(name, op)                                           \
+  template <typename T>                                                  \
+  static void name##_bench(nvbench::state& state, nvbench::type_list<T>) \
+  {                                                                      \
+    run_binary<op, T>(state);                                            \
+  }                                                                      \
+  NVBENCH_BENCH_TYPES(name##_bench, NVBENCH_TYPE_AXES(element_types))    \
+    .set_name("tile_pt_" #name)                                          \
+    .set_type_axes_names({"T{ct}"})                                      \
+    .add_int64_power_of_two_axis("Elements{io}", pt_sizes)
+
+BINARY_BENCH(add, binary_add);
+BINARY_BENCH(sub, binary_sub);
+BINARY_BENCH(mul, binary_mul);
+BINARY_BENCH(div, binary_div);
+BINARY_BENCH(le, binary_le);
+BINARY_BENCH(ge, binary_ge);
+BINARY_BENCH(fmin, binary_fmin);
+BINARY_BENCH(fmax, binary_fmax);
diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index d8ad0354bfc..5a8cfdad497 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -15,8 +15,13 @@
 
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_transform.cuh>
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
 #include <cub/util_namespace.cuh>
 
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <cub/device/dispatch/dispatch_transform_tile.cuh>
+#endif
+
 #include <cuda/__execution/tune.h>
 #include <cuda/__functional/address_stability.h>
 #include <cuda/__functional/always_true_false.h>
@@ -99,6 +104,26 @@ struct DeviceTransform
 
     const auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env).get();
 
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+    // Opt-in tile path. When the (Op, T, NIn) combo is trait-eligible and the device is sm_80+, we check the
+    // alignment/divisibility preconditions at runtime and route to the tile kernel; we fall through to the standard
+    // CUB dispatch below if they do not hold (CUB's kernels handle the unaligned/tail case, so this is a graceful
+    // fallback, not an error). device_supports_tile() enforces the sm_80+ hardware floor at runtime; below it (or if
+    // the capability query fails) we fall through to the standard CUB dispatch.
+    if constexpr (StableAddress == detail::transform::requires_stable_address::no
+                  && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
+                  && cub::detail::transform::tile::
+                    tile_dispatch_eligible_v<TransformOp, RandomAccessIteratorOut, RandomAccessIteratorsIn...>)
+    {
+      if (cub::detail::transform::tile::device_supports_tile()
+          && cub::detail::transform::tile::runtime_preconditions_valid(inputs, output, static_cast<offset_t>(num_items)))
+      {
+        return cub::detail::transform::tile::dispatch<TransformOp>(
+          inputs, output, static_cast<offset_t>(num_items), stream);
+      }
+    }
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
     using tuning_env =
       ::cuda::std::execution::__query_result_or_t<Env, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
     using default_policy_selector =
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile.cuh b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
new file mode 100644
index 00000000000..c319f429682
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_transform_tile.cuh
@@ -0,0 +1,151 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Internal dispatch helpers for cub::DeviceTransform's tile path:
+//   tile_dispatch_eligible_v  -- compile-time predicate the hook consults
+//   runtime_preconditions_valid  -- runtime alignment + divisibility predicate
+//   dispatch                  -- bridge that picks the tile size and launches
+//                                the tile kernel with the trait's substitute functor
+// User-facing extension points (tile_eligible / tile_mufu_heavy) live in
+// dispatch_transform_tile_traits.cuh under cub::transform.
+// Requires CTK 13.4 or newer and nvcc invoked with --enable-tile.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
+
+#  include <cub/device/dispatch/dispatch_transform_tile_traits.cuh>
+#  include <cub/device/dispatch/kernels/kernel_transform_tile.cuh>
+#  include <cub/device/dispatch/tuning/tuning_transform_tile.cuh>
+#  include <cub/util_debug.cuh>
+#  include <cub/util_device.cuh>
+
+#  include <thrust/type_traits/is_contiguous_iterator.h>
+#  include <thrust/type_traits/unwrap_contiguous_iterator.h>
+
+#  include <cuda/__cmath/ceil_div.h>
+#  include <cuda/__device/compute_capability.h>
+#  include <cuda/std/__iterator/readable_traits.h>
+#  include <cuda/std/__memory/is_sufficiently_aligned.h>
+#  include <cuda/std/__tuple_dir/apply.h>
+#  include <cuda/std/__type_traits/is_empty.h>
+#  include <cuda/std/__type_traits/is_trivially_default_constructible.h>
+#  include <cuda/std/__utility/integer_sequence.h>
+#  include <cuda/std/cstdint>
+#  include <cuda/std/tuple>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
+{
+template <int TileSize, typename Fn, typename Out, typename... Ins, ::cuda::std::size_t... Idx>
+[[nodiscard]] ::cudaError_t launch_impl(
+  ::cuda::std::tuple<Ins*...> inputs,
+  Out* output,
+  ::cuda::std::int64_t num_items,
+  ::cudaStream_t stream,
+  ::cuda::std::index_sequence<Idx...>)
+{
+  if (num_items <= 0)
+  {
+    return ::cudaSuccess;
+  }
+
+  // One CTA per tile. The cast to the unsigned grid x-dim can't truncate: num_blocks > 2^32-1
+  // would need num_items > TileSize * 2^32 (>= 2^40 elements), more than any device can hold.
+  const ::cuda::std::int64_t num_blocks = ::cuda::ceil_div(num_items, ::cuda::std::int64_t{TileSize});
+
+  cub::detail::transform::tile::transform_kernel<TileSize, Fn>
+    <<<static_cast<unsigned>(num_blocks), 1, 0, stream>>>(num_items, output, ::cuda::std::get<Idx>(inputs)...);
+
+  return CubDebug(::cudaGetLastError());
+}
+
+// Combined compile-time predicate for whether (Op, OutIter, InIters...) can use the tile path. We use this with
+// `if constexpr` for dispatch: when true the hook tries the tile kernel first and, on runtime alignment/divisibility
+// failure, falls through to the standard CUB dispatch; when false the tile branch is discarded entirely.
+template <typename Op, typename OutIter, typename... InIters>
+inline constexpr bool tile_dispatch_eligible_v =
+  THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutIter>
+  && (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InIters> && ...)
+  && cub::transform::tile_eligible_v<Op, ::cuda::std::iter_value_t<OutIter>, sizeof...(InIters)>;
+
+// Runtime arch gate: tile needs sm_80+. False (fall back to CUB) below sm_80 or if the cc query fails.
+[[nodiscard]] CUB_RUNTIME_FUNCTION inline bool device_supports_tile()
+{
+  ::cuda::compute_capability cc{};
+  return cub::detail::ptx_compute_cap(cc) == ::cudaSuccess && cc >= ::cuda::compute_capability{8, 0};
+}
+
+// Runtime precondition the tile hook checks before dispatching: 16-byte pointer alignment + num_items % 16 == 0
+// (the kernels assume_aligned<16>/assume_divisible<16>, so violating these is UB). False -> fall back to CUB.
+template <typename OutIter, typename... InIters, typename OffsetT>
+[[nodiscard]] CUB_RUNTIME_FUNCTION bool
+runtime_preconditions_valid(::cuda::std::tuple<InIters...> const& inputs, OutIter output, OffsetT num_items)
+{
+  // Pointer alignment is in bytes (for LDG.E.128); the kernel's
+  // ct::assume_divisible<N> applies to num_items as an element count. These
+  // are both 16 today by coincidence but live on different axes.
+  constexpr int byte_align    = 16;
+  constexpr int items_divisor = 16;
+
+  const auto out_ptr     = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  const bool aligned_out = ::cuda::std::is_sufficiently_aligned<byte_align>(out_ptr);
+  const bool aligned_in  = ::cuda::std::apply(
+    [](auto... iters) {
+      return (
+        (::cuda::std::is_sufficiently_aligned<byte_align>(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)))
+        && ...);
+    },
+    inputs);
+
+  return aligned_out && aligned_in && (num_items % items_divisor) == 0;
+}
+
+// Bridge from cub::DeviceTransform::__transform_internal to the tile kernel. Precondition (the caller
+// checks it): tile_dispatch_eligible_v is true AND runtime_preconditions_valid returned true. Launches the kernel
+// with tile_operator_t<Op> -- Op's registered __tile__ mirror (a scalar functor can't be invoked on ct::tile).
+template <typename TransformOp, typename OutIter, typename... InIters, typename OffsetT>
+[[nodiscard]] CUB_RUNTIME_FUNCTION ::cudaError_t
+dispatch(::cuda::std::tuple<InIters...> inputs, OutIter output, OffsetT num_items, ::cudaStream_t stream)
+{
+  const auto out_ptr = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(output);
+  const auto in_ptrs = ::cuda::std::apply(
+    [](auto... iters) {
+      return ::cuda::std::make_tuple(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(iters)...);
+    },
+    inputs);
+
+  using tile_op_t = cub::transform::tile_operator_t<TransformOp>;
+  static_assert(::cuda::std::is_empty_v<tile_op_t>,
+                "tile_operator type must be stateless (the tile kernel default-constructs it)");
+  static_assert(::cuda::std::is_trivially_default_constructible_v<tile_op_t>,
+                "tile_operator type must be trivially default constructible");
+
+  constexpr int tile_size = cub::detail::transform::tile::pick_tile_size<::cuda::std::iter_value_t<OutIter>,
+                                                                         ::cuda::std::iter_value_t<InIters>...>(
+    cub::transform::tile_mufu_heavy_v<TransformOp>);
+  return cub::detail::transform::tile::launch_impl<tile_size, tile_op_t>(
+    in_ptrs,
+    out_ptr,
+    static_cast<::cuda::std::int64_t>(num_items),
+    stream,
+    ::cuda::std::index_sequence_for<InIters...>{});
+}
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
new file mode 100644
index 00000000000..0c06b091335
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_config.cuh
@@ -0,0 +1,35 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Single source of truth for the compile-time gates the tile transform headers
+// share. Two macros:
+//
+//   _CCCL_CUB_HAS_TILE_TRANSFORM()
+//     True when nvcc is in tile mode (--enable-tile / _CCCL_TILE_COMPILATION()) AND CTK 13.4+. The sm_80+
+//     requirement is handled at runtime + NV_IF_TARGET in the kernels, not here. When false, the tile headers
+//     (kernel / tuning / dispatch / traits) are skipped entirely.
+//
+//   _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+//     True when the dispatch hook in cub::DeviceTransform should fire. Same as
+//     _CCCL_CUB_HAS_TILE_TRANSFORM() plus the user opt-in macro
+//     CCCL_ENABLE_TILE_TRANSFORM_DISPATCH.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#define _CCCL_CUB_HAS_TILE_TRANSFORM() (_CCCL_TILE_COMPILATION() && _CCCL_CTK_AT_LEAST(13, 4))
+
+#if _CCCL_CUB_HAS_TILE_TRANSFORM() && defined(CCCL_ENABLE_TILE_TRANSFORM_DISPATCH)
+#  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 1
+#else
+#  define _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED() 0
+#endif
diff --git a/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
new file mode 100644
index 00000000000..ad4e05926e0
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_transform_tile_traits.cuh
@@ -0,0 +1,146 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Compile-time policy for cub::DeviceTransform's tile path.
+//
+// PUBLIC EXTENSION POINTS (cub::transform) -- two independent axes:
+//   tile_eligible_v<Op, T, NIn> -- specialize to true to opt a (functor type,
+//                                   element type, input arity) combo into the
+//                                   tile dispatch path. Eligibility only.
+//   tile_operator<Op>           -- the __tile__ functor the tile kernel runs
+//                                   for Op. No default: every tile-eligible Op
+//                                   must specialize it with `using type = <a
+//                                   stateless __tile__ functor mirroring Op>`,
+//                                   because a scalar functor (e.g.
+//                                   cuda::std::plus<__half>) cannot be invoked
+//                                   on ct::tile. Omitting it is a clear
+//                                   static_assert, not a cryptic kernel error.
+//   tile_operator_t<Op>         -- alias for tile_operator<Op>::type.
+//   tile_mufu_heavy_v<Op>       -- specialize to true to flag Op as MUFU-heavy; the tile policy picker uses it.
+//
+// Eligibility ("may this combo use the tile path?") and substitution ("which
+// __tile__ functor do we actually run?") are separate traits, so an eligible op
+// always registers both: tile_eligible_v<Op,T,NIn> and tile_operator<Op>.
+//
+// INTERNAL (cub::detail::transform::tile):
+//   tile_plus, tile_multiplies   -- shipped tile-friendly substitutes used by
+//                                    the built-in specializations below.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
+
+#  include <cuda/std/__cstddef/types.h>
+#  include <cuda/std/__functional/operations.h>
+#  include <cuda/std/__type_traits/integral_constant.h>
+
+#  include <cuda_tile.h>
+
+CUB_NAMESPACE_BEGIN
+
+// Public extension surface.
+namespace transform
+{
+// Opt a (functor type, element type, input arity) combo into the tile dispatch path: specialize this to
+// true for the combo. Eligibility only -- the __tile__ functor to actually run is named by tile_operator<Op>.
+template <typename Op, typename T, ::cuda::std::size_t NIn>
+inline constexpr bool tile_eligible_v = false;
+
+// The __tile__ functor the tile kernel runs for Op.
+template <typename Op>
+struct tile_operator
+{
+  static_assert(sizeof(Op) == 0,
+                "cub::transform::tile_operator<Op> must be specialized for every tile-eligible Op: "
+                "provide `using type = <stateless __tile__ functor mirroring Op>`.");
+};
+
+template <typename Op>
+using tile_operator_t = typename tile_operator<Op>::type;
+
+// Hint that Op uses MUFU (multi-function unit, sin/cos/exp/log/tanh/rcp/rsq); specialize to true to make the tile
+// policy picker cap items/thread so MUFU pipes are not oversaturated.
+template <typename Op>
+inline constexpr bool tile_mufu_heavy_v = false;
+} // namespace transform
+
+// Internal substitutes shipped by CCCL.
+namespace detail::transform::tile
+{
+// Tile-friendly mirrors of common cuda::std ops. Each has a __tile__
+// templated operator() so it can be invoked from inside transform_kernel
+// where the arguments are ct::tile<T, ...> rather than scalar T.
+struct tile_plus
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+
+struct tile_multiplies
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a * b;
+  }
+};
+} // namespace detail::transform::tile
+
+// Built-in trait specializations live in the public namespace alongside the
+// trait, but reference the internal substitute functors.
+namespace transform
+{
+// cuda::std::plus / multiplies are scalar ops, so each is marked eligible and given a tile_operator mirror.
+#  if _CCCL_HAS_NVFP16()
+template <>
+inline constexpr bool tile_eligible_v<::cuda::std::plus<::__half>, ::__half, 2> = true;
+template <>
+inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__half>, ::__half, 2> = true;
+template <>
+struct tile_operator<::cuda::std::plus<::__half>>
+{
+  using type = cub::detail::transform::tile::tile_plus;
+};
+template <>
+struct tile_operator<::cuda::std::multiplies<::__half>>
+{
+  using type = cub::detail::transform::tile::tile_multiplies;
+};
+#  endif // _CCCL_HAS_NVFP16()
+
+#  if _CCCL_HAS_NVBF16()
+template <>
+inline constexpr bool tile_eligible_v<::cuda::std::plus<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true;
+template <>
+inline constexpr bool tile_eligible_v<::cuda::std::multiplies<::__nv_bfloat16>, ::__nv_bfloat16, 2> = true;
+template <>
+struct tile_operator<::cuda::std::plus<::__nv_bfloat16>>
+{
+  using type = cub::detail::transform::tile::tile_plus;
+};
+template <>
+struct tile_operator<::cuda::std::multiplies<::__nv_bfloat16>>
+{
+  using type = cub::detail::transform::tile::tile_multiplies;
+};
+#  endif // _CCCL_HAS_NVBF16()
+} // namespace transform
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
new file mode 100644
index 00000000000..b2e8f2b5e68
--- /dev/null
+++ b/cub/cub/device/dispatch/kernels/kernel_transform_tile.cuh
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
+
+#  include <cuda/std/cstdint>
+
+#  include <nv/target>
+
+#  include <cuda_tile.h>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
+{
+// Build a tile partition_view for a 1D contiguous buffer. The two annotations are load-bearing:
+//   assume_aligned<16>      -- promises the pointer is 16-byte aligned, so the compiler can pick LDG.E.128 vectorized
+//                              loads/stores.
+//   ct::extents<int64_t,..> -- explicit element type on the extent; CTAD would deduce uint32_t and wrap at 2^32.
+//                              int64_t lets us cover the full num_items range.
+// The caller is responsible for honoring assume_aligned<16>; the dispatch header's runtime_preconditions_valid
+// enforces this before launching either kernel.
+template <int TileSize, typename T, typename N>
+[[nodiscard]] __tile__ auto make_aligned_partition_view(T* ptr, N n)
+{
+  namespace ct         = ::cuda::tiles;
+  const auto ptr_align = ct::assume_aligned<16>(ptr);
+  auto span            = ct::tensor_span{ptr_align, ct::extents<::cuda::std::int64_t, ct::dynamic_extent>{n}};
+  return ct::partition_view{span, ct::shape<TileSize>{}};
+}
+
+// Tile DSL kernel backing cub::DeviceTransform's tile path. It assumes 16-byte pointer alignment + 16-divisible
+// num_items (so the compiler picks LDG.E.128); the dispatch header honors that. NV_IF_TARGET(NV_PROVIDES_SM_80)
+// guards the body -- tile needs sm_80+, so sub-80 arches get a no-op kernel (dispatch only launches it on sm_80+).
+//   assume_divisible<16>     -- num_items % 16 == 0, so the tile DSL can elide tail handling.
+//   assume_bounded_below<0>  -- num_items >= 0; enables sign-comparison simplifications.
+//
+// NOTE: make_aligned_partition_view is invoked directly -- do NOT wrap these calls in a lambda: nvcc 13.4
+// miscompiles a templated __tile__ helper called via a lambda under --expt-relaxed-constexpr (invalid IR).
+template <int TileSize, typename Fn, typename Out, typename... Ins>
+__tile_global__ void transform_kernel(const ::cuda::std::int64_t num_items, Out* out, const Ins*... ins)
+{
+  namespace ct = ::cuda::tiles;
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_80,
+    (const auto bx = ct::bid().x; const auto n = ct::assume_bounded_below<0>(ct::assume_divisible<16>(num_items));
+     const auto out_view                       = make_aligned_partition_view<TileSize>(out, n);
+     out_view.store_masked(Fn{}(make_aligned_partition_view<TileSize>(ins, n).load_masked(bx)...), bx);));
+}
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
new file mode 100644
index 00000000000..8a11ad60f7a
--- /dev/null
+++ b/cub/cub/device/dispatch/tuning/tuning_transform_tile.cuh
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Policy picker for cub::DeviceTransform's tile path. Shares the
+// bytes-in-flight target used by CUB's non-tile algorithms (calls
+// tuning_transform.cuh's cc_to_min_bytes_in_flight) but expresses the
+// answer as a TileSize, since tile kernels partition by compile-time
+// shape rather than threads*items.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cub/device/dispatch/dispatch_transform_tile_config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CUB_HAS_TILE_TRANSFORM()
+
+#  include <cub/device/dispatch/tuning/tuning_transform.cuh>
+
+#  include <cuda/__cmath/ceil_div.h>
+#  include <cuda/__cmath/pow2.h>
+#  include <cuda/__device/compute_capability.h>
+#  include <cuda/std/__algorithm/max.h>
+#  include <cuda/std/__algorithm/min.h>
+#  include <cuda/std/__cstddef/types.h>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::transform::tile
+{
+// mufu_heavy=true tells the policy the functor body has heavy MUFU usage.
+// for small data types, vectorized load will make them arrive packed in
+// registers and the compiler unpacks them and packs them back. reducing the
+// compute work per thread helps here. need profiling to know the exact cause.
+template <typename Out, typename... Ins>
+constexpr int pick_tile_size(bool mufu_heavy = false, ::cuda::compute_capability cc = {10, 0})
+{
+  constexpr int threads_per_block    = 128;
+  constexpr int vector_bytes         = 16; // LDG.E.128 -> 16 bytes
+  constexpr int max_items_per_thread = 32;
+  constexpr int max_occupancy        = 16;
+
+  constexpr auto min_elem     = ::cuda::std::min({sizeof(Out), sizeof(Ins)...});
+  constexpr int items_for_vec = ::cuda::ceil_div(vector_bytes, min_elem);
+
+  // Fill (zero inputs) keeps the same latency target by counting output bytes.
+  constexpr auto bytes_per_iter = (sizeof...(Ins) > 0) ? (sizeof(Ins) + ... + ::cuda::std::size_t{0}) : sizeof(Out);
+  const int target              = cub::detail::transform::cc_to_min_bytes_in_flight(cc);
+  const int items_for_latency =
+    static_cast<int>(::cuda::ceil_div(target, max_occupancy * threads_per_block * bytes_per_iter));
+
+  int items = ::cuda::std::max(items_for_vec, items_for_latency);
+  items     = static_cast<int>(::cuda::next_power_of_two(static_cast<unsigned>(items)));
+  if (items > max_items_per_thread)
+  {
+    items = max_items_per_thread;
+  }
+
+  if (mufu_heavy && min_elem < 4)
+  {
+    // Elements that fit in one 16-byte vector load -> items/thread cap for MUFU-heavy sub-4B ops.
+    // min_elem is size_t, so cast the quotient once here to keep this an int item count (matches
+    // items below, so the comparison/assignment stay int-vs-int: no sign-compare, no use-site casts).
+    constexpr int vec_items_cap = static_cast<int>(vector_bytes / min_elem); // 16 for I8, 8 for I16/half/bf16
+    if (items > vec_items_cap)
+    {
+      items = vec_items_cap;
+    }
+  }
+
+  return items * threads_per_block;
+}
+} // namespace detail::transform::tile
+
+CUB_NAMESPACE_END
+
+#endif // _CCCL_CUB_HAS_TILE_TRANSFORM()
diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt
index ce46a86b93e..0214861c053 100644
--- a/cub/test/CMakeLists.txt
+++ b/cub/test/CMakeLists.txt
@@ -172,6 +172,22 @@ function(
       target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists
     endif()
 
+    # Tile-DSL transform test: compile under --enable-tile and turn on the dispatch hook. Gated by
+    # CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI keeps the tile path compiled out.
+    if (
+      CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+      AND "${test_src}" MATCHES "test_device_transform_tile\\.cu$"
+    )
+      target_compile_options(
+        ${test_target}
+        PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--enable-tile>
+      )
+      target_compile_definitions(
+        ${test_target}
+        PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
+      )
+    endif()
+
     # enable lambdas for all API examples
     if ("${test_src}" MATCHES "test.+_api\\.cu$")
       target_compile_options(
diff --git a/cub/test/catch2_test_device_transform_tile.cu b/cub/test/catch2_test_device_transform_tile.cu
new file mode 100644
index 00000000000..f77eea0e31b
--- /dev/null
+++ b/cub/test/catch2_test_device_transform_tile.cu
@@ -0,0 +1,143 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_transform.cuh>
+
+#include <c2h/catch2_test_helper.h>
+
+// The tile dispatch path only exists when nvcc is invoked with --enable-tile and the user opts in via
+// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH. In any other build this file compiles to a single skipped test.
+#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+#  include <algorithm>
+
+#  include <cuda_tile.h>
+
+#  include "catch2_test_launch_helper.h"
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many);
+
+namespace ct = ::cuda::tiles;
+
+// Each scalar op (passed to Transform, used by the CUB fallback) pairs with a tile-side mirror
+// registered through tile_operator. The bodies use tile-tile arithmetic and wrap for unsigned types,
+// so the tile result is bit-exact with the host reference.
+
+// Unary: v * v.
+struct square_op
+{
+  template <class T>
+  __host__ __device__ T operator()(T v) const
+  {
+    return static_cast<T>(v * v);
+  }
+};
+struct tile_square_op
+{
+  template <class T>
+  __tile__ auto operator()(T v) const
+  {
+    return v * v;
+  }
+};
+
+// Binary: a + b.
+struct add_op
+{
+  template <class A, class B>
+  __host__ __device__ auto operator()(A a, B b) const
+  {
+    return static_cast<A>(a + b);
+  }
+};
+struct tile_add_op
+{
+  template <class A, class B>
+  __tile__ auto operator()(A a, B b) const
+  {
+    return a + b;
+  }
+};
+
+CUB_NAMESPACE_BEGIN
+namespace transform
+{
+template <class T>
+inline constexpr bool tile_eligible_v<square_op, T, 1> = true;
+template <>
+struct tile_operator<square_op>
+{
+  using type = tile_square_op;
+};
+
+template <class T>
+inline constexpr bool tile_eligible_v<add_op, T, 2> = true;
+template <>
+struct tile_operator<add_op>
+{
+  using type = tile_add_op;
+};
+} // namespace transform
+CUB_NAMESPACE_END
+
+// Unsigned types so arithmetic wraps deterministically and matches the host reference bit-for-bit.
+using tile_types = c2h::type_list<::cuda::std::uint32_t, ::cuda::std::uint64_t>;
+
+// Sizes span the runtime preconditions: multiples of 16 (with aligned c2h buffers) take the tile
+// kernel; the others fall back to the standard CUB dispatch. Both must produce identical results.
+#  define TILE_TRANSFORM_SIZES GENERATE(::cuda::std::int64_t{0}, 16, 32, 128, 1024, 4096, 65536, 17, 127, 1000)
+
+C2H_TEST("DeviceTransform tile dispatch: unary scalar op routed through its tile_operator substitute",
+         "[device][transform][tile]",
+         tile_types)
+{
+  using type                           = c2h::get<0, TestType>;
+  const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES;
+  CAPTURE(c2h::type_name<type>(), num_items);
+
+  c2h::device_vector<type> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(2), in);
+  c2h::device_vector<type> result(num_items, thrust::no_init);
+
+  transform_many(::cuda::std::make_tuple(in.begin()), result.begin(), num_items, square_op{});
+
+  c2h::host_vector<type> in_h = in;
+  c2h::host_vector<type> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), square_op{});
+  REQUIRE(reference_h == result);
+}
+
+C2H_TEST("DeviceTransform tile dispatch: binary scalar op routed through its tile_operator substitute",
+         "[device][transform][tile]",
+         tile_types)
+{
+  using type                           = c2h::get<0, TestType>;
+  const ::cuda::std::int64_t num_items = TILE_TRANSFORM_SIZES;
+  CAPTURE(c2h::type_name<type>(), num_items);
+
+  c2h::device_vector<type> a(num_items, thrust::no_init);
+  c2h::device_vector<type> b(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(2), a);
+  c2h::gen(C2H_SEED(2), b);
+  c2h::device_vector<type> result(num_items, thrust::no_init);
+
+  transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, add_op{});
+
+  c2h::host_vector<type> a_h = a;
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items, thrust::no_init);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), add_op{});
+  REQUIRE(reference_h == result);
+}
+
+#else // !_CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
+
+C2H_TEST("DeviceTransform tile dispatch requires --enable-tile", "[device][transform][tile]")
+{
+  SUCCEED("tile transform dispatch not enabled in this build");
+}
+
+#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h
index f6cd76cc929..69d1509ebd2 100644
--- a/libcudacxx/include/cuda/std/__cccl/builtin.h
+++ b/libcudacxx/include/cuda/std/__cccl/builtin.h
@@ -112,6 +112,11 @@
 #  define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__)
 #endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned)
 
+#if _CCCL_TILE_COMPILATION() // __builtin_assume_aligned is not supported in tile mode
+#  undef _CCCL_BUILTIN_ASSUME_ALIGNED
+#  define _CCCL_BUILTIN_ASSUME_ALIGNED(_Ptr, ...) (_Ptr)
+#endif // _CCCL_TILE_COMPILATION()
+
 #if _CCCL_CHECK_BUILTIN(builtin_constant_p) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__)
 #endif // _CCCL_CHECK_BUILTIN(builtin_constant_p)
diff --git a/libcudacxx/include/cuda/std/__new/launder.h b/libcudacxx/include/cuda/std/__new/launder.h
index e2f3af192a0..3d67950fc18 100644
--- a/libcudacxx/include/cuda/std/__new/launder.h
+++ b/libcudacxx/include/cuda/std/__new/launder.h
@@ -32,6 +32,10 @@
 #  define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__)
 #endif // _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC)
 
+#if _CCCL_TILE_COMPILATION() // __builtin_launder is not supported in tile mode
+#  undef _CCCL_BUILTIN_LAUNDER
+#endif // _CCCL_TILE_COMPILATION()
+
 _CCCL_BEGIN_NAMESPACE_CUDA_STD
 
 template <class _Tp>