Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
6e91c89
tile DeviceTransform policy picker
nanan-nvidia Jun 1, 2026
4980aab
tile DeviceTransform kernels + public API
nanan-nvidia Jun 1, 2026
848a645
bench_init RNG helper
nanan-nvidia Jun 1, 2026
4632136
babelstream tile bench
nanan-nvidia Jun 1, 2026
ea6ae70
pytorch tile bench
nanan-nvidia Jun 1, 2026
57c712c
copy/grayscale/fill tile benches
nanan-nvidia Jun 1, 2026
7ecb113
tile DeviceTransform tests
nanan-nvidia Jun 1, 2026
d5eff22
move tile DeviceTransform header into CUB public path
nanan-nvidia Jun 3, 2026
5f98857
move tile dispatch into cub::detail::transform::tile namespace
nanan-nvidia Jun 3, 2026
8ae89ef
add tile dispatch trait header
nanan-nvidia Jun 3, 2026
37c8b71
split tile dispatch into kernel and tuning headers
nanan-nvidia Jun 3, 2026
7fb935f
wire tile dispatch hook into cub::DeviceTransform
nanan-nvidia Jun 3, 2026
906fbec
drop runtime gates from tile dispatch helper
nanan-nvidia Jun 3, 2026
249131e
add runtime alignment check before routing to tile
nanan-nvidia Jun 3, 2026
744dbb3
drop _CCCL_TILE from _CCCL_API to unblock CUB under enable-tile
nanan-nvidia Jun 3, 2026
74fd6cc
fall back to standard CUB dispatch when tile preconditions fail
nanan-nvidia Jun 3, 2026
b7e8c92
migrate tile benches and tests to cub::DeviceTransform
nanan-nvidia Jun 4, 2026
4afa1b3
use int64 extents in tile kernels and clean up runtime precondition c…
nanan-nvidia Jun 5, 2026
f761178
lift tile_eligible and tile_mufu_heavy to cub::transform namespace
nanan-nvidia Jun 9, 2026
eb6bd04
purge outdated comments from before runtime fallback was added
nanan-nvidia Jun 9, 2026
f69e0ed
move kernel doc-comment next to the kernel and reflow to 100 col
nanan-nvidia Jun 9, 2026
207ba0e
gate tile transform headers on a single config macro
nanan-nvidia Jun 9, 2026
48b949a
tidy kernel_transform_tile.cuh: use cuda::std::int64_t and drop _-suf…
nanan-nvidia Jun 9, 2026
bb091d0
factor out make_partition_view helper and document assume_* annotations
nanan-nvidia Jun 9, 2026
514b536
const-qualify scalar parameters in transform_kernel and fill_kernel
nanan-nvidia Jun 9, 2026
607f7a0
rename runtime_preconditions_ok to runtime_preconditions_valid
nanan-nvidia Jun 9, 2026
a5d3eca
trim tile traits header includes
nanan-nvidia Jun 9, 2026
8eaa495
annotate tile-path return-valued helpers with [[nodiscard]]
nanan-nvidia Jun 9, 2026
0055590
drop redundant __detail sub-namespace from tile dispatch helper
nanan-nvidia Jun 10, 2026
c7ee05c
use cub::detail::it_value_t and drop hand-rolled unwrap helper
nanan-nvidia Jun 10, 2026
91d3945
drop redundant template keyword on DeviceTransform::Transform call
nanan-nvidia Jun 10, 2026
3c67f7d
wrap kernel-launch error checks with CubDebug
nanan-nvidia Jun 10, 2026
fd9b7a2
fully qualify tile kernel-launch names and use unsigned in casts
nanan-nvidia Jun 10, 2026
0793ffe
document tile_mufu_heavy with a usage hint
nanan-nvidia Jun 10, 2026
3e206e6
use ::cuda::ceil_div for block-count math in tile dispatch
nanan-nvidia Jun 10, 2026
e56f0d6
reuse CUB's cc_to_min_bytes_in_flight, take compute_capability object
nanan-nvidia Jun 10, 2026
e9e9939
use ::cuda::std::min initializer list instead of hand-rolled variadic…
nanan-nvidia Jun 10, 2026
cc77ef2
drop int() casts on sizeof and use ::cuda::std::max
nanan-nvidia Jun 10, 2026
69d2339
simplify _CCCL_CUB_HAS_TILE_TRANSFORM to just _CCCL_TILE_COMPILATION
nanan-nvidia Jun 10, 2026
4c2d0c5
fully qualify cub::detail/cub::transform refs + ::cuda* runtime types
nanan-nvidia Jun 10, 2026
9d861ec
reflow kernel_transform_tile.cuh comments to 120-column limit
nanan-nvidia Jun 10, 2026
b83cc70
anchor make_partition_view with using-decl; inline stateless Fn
nanan-nvidia Jun 10, 2026
60db7ce
anchor remaining intra-namespace helper calls
nanan-nvidia Jun 11, 2026
85394e1
rename make_partition_view -> make_aligned_partition_view
nanan-nvidia Jun 11, 2026
63f6bda
mark out_view const in both tile kernels
nanan-nvidia Jun 11, 2026
0a81752
include specific libcu++ headers instead of <cuda/cmath> umbrella
nanan-nvidia Jun 11, 2026
33a783a
make byte_cap constexpr in pick_tile_size
nanan-nvidia Jun 11, 2026
37cf303
drop redundant static_cast<int> on items_for_vec
nanan-nvidia Jun 11, 2026
0fb0a4f
define gate macro as literal 1/0 to avoid expansion-to-defined UB
nanan-nvidia Jun 11, 2026
768ab0c
make the vector-width cap an int so the cap comparison needs no casts
nanan-nvidia Jun 11, 2026
418e592
drop CUB_NS_QUALIFIER from tile_eligible substitutes (sweep missed them)
nanan-nvidia Jun 11, 2026
5123278
document why num_blocks fits the unsigned grid x-dim
nanan-nvidia Jun 11, 2026
1fa7289
clang-format dispatch_transform_tile.cuh
nanan-nvidia Jun 11, 2026
34568d4
split tile_op_t alias into an intermediate out_value_t
nanan-nvidia Jun 11, 2026
f80bb25
separate tile eligibility from the tile-operator substitute
nanan-nvidia Jun 11, 2026
dcb838d
add gated c2h test for the tile transform dispatch path
nanan-nvidia Jun 12, 2026
14a30fc
use thrust::device_vector in copy bench
nanan-nvidia Jun 12, 2026
67af257
rewrite tile babelstream bench to CUB conventions
nanan-nvidia Jun 12, 2026
2a1983a
rewrite remaining tile benches to CUB conventions; drop redundant files
nanan-nvidia Jun 12, 2026
0141839
accept cuda::aligned_size_t<16> as a compile-time tile-commit hint
nanan-nvidia Jun 12, 2026
99042d3
clang-format kernel_transform_tile.cuh and tuning_transform_tile.cuh
nanan-nvidia Jun 12, 2026
4a5ec54
guard fp16/bf16 in tile pytorch bench on CTK 12.2+
nanan-nvidia Jun 12, 2026
76ac55f
drop redundant comment on the gate macro
nanan-nvidia Jun 12, 2026
67b9489
address review nits in tile transform dispatch
nanan-nvidia Jun 12, 2026
8614d45
collapse tile_eligible to a single variable template
nanan-nvidia Jun 12, 2026
81f2133
avoid nvcc 13.4 tile lambda-linkage miscompile: call partition-view h…
nanan-nvidia Jun 12, 2026
393ce96
add opt-in CMake option to build cub::DeviceTransform's tile path und…
nanan-nvidia Jun 12, 2026
76d02eb
gate cub::DeviceTransform tile path on CTK 13.4
nanan-nvidia Jun 12, 2026
6e71364
gate the tile path on sm_80 (NV_IF_TARGET + runtime cc check)
nanan-nvidia Jun 13, 2026
a1c01c7
default CCCL_ENABLE_TILE_TRANSFORM_DISPATCH ON for CTK 13.4+
nanan-nvidia Jun 13, 2026
b9d49d0
trim verbose comments in the tile transform headers
nanan-nvidia Jun 13, 2026
f76647e
tidy tile bench type axes
nanan-nvidia Jun 14, 2026
b1df2c6
collapse tile_mufu_heavy to a single variable template
nanan-nvidia Jun 15, 2026
7428773
libcudacxx: undef tile-unsupported builtins instead of stripping _CCC…
nanan-nvidia Jun 15, 2026
0c13142
drop aligned_size_t tile-commit hint so all DeviceTransform changes a…
nanan-nvidia Jun 22, 2026
da688b6
tile: drop internal DeviceTransform struct; fold tile-size pick into …
nanan-nvidia Jun 23, 2026
ebede88
tile: remove unused fill_kernel (no Fill hook wired)
nanan-nvidia Jun 23, 2026
cd8d429
tile: drop redundant inline comments in dispatch
nanan-nvidia Jun 23, 2026
8a8724b
tile: use Apache-2.0 license header for new bench files
nanan-nvidia Jun 23, 2026
20fd237
tile: clang-format dispatch and fix stale struct reference in comment
nanan-nvidia Jun 23, 2026
581fcf4
tile: mark out_ptr const in runtime_preconditions_valid
nanan-nvidia Jun 23, 2026
3ce1647
tile: drop __restrict__ from kernel params (API permits in-place tran…
nanan-nvidia Jun 23, 2026
f7729e7
tile: reflow transform_kernel signature (clang-format)
nanan-nvidia Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions cub/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,34 @@ option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)

# Build cub::DeviceTransform's tile-DSL path (test + benches) under `nvcc --enable-tile`. Defaults ON when the
# toolkit can build it (CTK 13.4+), so 13.4+ configs -- including CI -- exercise the tile path automatically; OFF
# and compiled out below 13.4. The sm_80+ floor is enforced at runtime (dispatch cc check + NV_IF_TARGET in the
# kernels), so a 13.4+ build still runs correctly on any GPU.
set(_cccl_tile_transform_default OFF)
if (
"${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA"
AND NOT "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4
)
set(_cccl_tile_transform_default ON)
endif()
option(
CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
"Build cub::DeviceTransform's tile path (requires nvcc --enable-tile)."
${_cccl_tile_transform_default}
)
if (
CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
AND "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA"
AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 13.4
)
message(
FATAL_ERROR
"CCCL_ENABLE_TILE_TRANSFORM_DISPATCH requires CUDA 13.4+ (nvcc --enable-tile). "
"Found ${CMAKE_CUDA_COMPILER_VERSION}."
)
endif()

option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF)
if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
set(CUB_ENABLE_TUNING OFF)
Expand Down
16 changes: 16 additions & 0 deletions cub/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ function(add_bench target_name bench_name bench_src)
cccl.nvbench_helper
nvbench::main
)

# Tile-DSL transform benches: build under --enable-tile + the dispatch opt-in when requested. Gated by
# CCCL_ENABLE_TILE_TRANSFORM_DISPATCH (default OFF) so CI builds the tile/ benches with the tile path off.
if (
CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
AND "${bench_src}" MATCHES "/transform/tile/"
)
target_compile_options(
${bench_target}
PRIVATE "$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--enable-tile>"
)
target_compile_definitions(
${bench_target}
PRIVATE CCCL_ENABLE_TILE_TRANSFORM_DISPATCH
)
endif()
endfunction()

function(add_bench_dir bench_dir)
Expand Down
230 changes: 230 additions & 0 deletions cub/benchmarks/bench/transform/tile/babelstream.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Tile variant of the BabelStream transform bench. The lambdas of the base benchmark are replaced by
// named, stateless ops that register a tile_operator substitute (gated). Under --enable-tile +
// CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes them to the tile kernel; otherwise this
// is the standard CUB transform path. This file disappears once tile dispatch is fully transparent.

#include "../common.h"

#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
# include <cuda_tile.h>
#endif

// Stateless scalar ops, used at the call site in both build modes. Constants are baked in so the ops
// stay stateless (the tile substitute must be trivially default constructible): with startScalar == -2,
// `c * scalar` is `-(c + c)`, `b + scalar * c` is `b - c - c`, etc.
struct mul_op
{
template <class B>
__host__ __device__ auto operator()(B b) const
{
return -(b + b);
}
};
Comment on lines +18 to +25

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: We should not need new operator definitions. We should just specialize tile_operator for cuda::plus etc. Unless there is a reason this is not possible.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unfortunately, we need to keep them independent for now

struct add_op
{
template <class A, class B>
__host__ __device__ auto operator()(A a, B b) const
{
return a + b;
}
};
struct triad_op
{
template <class B, class C>
__host__ __device__ auto operator()(B b, C c) const
{
return b - c - c;
}
};
struct nstream_op
{
template <class A, class B, class C>
__host__ __device__ auto operator()(A a, B b, C c) const
{
return a + b - c - c;
}
};

#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
struct tile_mul_op
{
template <class B>
__tile__ auto operator()(B b) const
{
return -(b + b);
}
};
struct tile_add_op
{
template <class A, class B>
__tile__ auto operator()(A a, B b) const
{
return a + b;
}
};
struct tile_triad_op
{
template <class B, class C>
__tile__ auto operator()(B b, C c) const
{
return b - c - c;
}
};
struct tile_nstream_op
{
template <class A, class B, class C>
__tile__ auto operator()(A a, B b, C c) const
{
return a + b - c - c;
}
};

CUB_NAMESPACE_BEGIN
namespace transform
{
template <class T>
inline constexpr bool tile_eligible_v<mul_op, T, 1> = true;
template <class T>
inline constexpr bool tile_eligible_v<add_op, T, 2> = true;
template <class T>
inline constexpr bool tile_eligible_v<triad_op, T, 2> = true;
template <class T>
inline constexpr bool tile_eligible_v<nstream_op, T, 3> = true;
template <>
struct tile_operator<mul_op>
{
using type = tile_mul_op;
};
template <>
struct tile_operator<add_op>
{
using type = tile_add_op;
};
template <>
struct tile_operator<triad_op>
{
using type = tile_triad_op;
};
template <>
struct tile_operator<nstream_op>
{
using type = tile_nstream_op;
};
} // namespace transform
CUB_NAMESPACE_END
#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()

// The tile path does not support __int128 (no tensor_span/partition_view for it), so the type axis
// omits it relative to the base babelstream bench.
#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types = nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::float32_t, nvbench::float64_t>;
#endif

inline auto array_size_powers = nvbench::range(16, 32, 4);

// Same constant inputs as the base bench so nstream maintains a consistent workload.
inline constexpr auto startA = 11;
inline constexpr auto startB = 2;
inline constexpr auto startC = 1;
inline constexpr auto startScalar = -2;
static_assert(startA == (startA + startB + startScalar * startC), "nstream must have a consistent workload");

template <typename T>
static void mul(nvbench::state& state, nvbench::type_list<T>)
try
{
const auto n = state.get_int64("Elements{io}");
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(n);
state.add_global_memory_writes<T>(n);
bench_transform(state, cuda::std::tuple{c.begin()}, b.begin(), n, mul_op{});
}
catch (const std::bad_alloc&)
{
state.skip("Skipping: out of memory.");
}

NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types))
.set_name("tile_mul")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T>
static void add(nvbench::state& state, nvbench::type_list<T>)
try
{
const auto n = state.get_int64("Elements{io}");
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
bench_transform(state, cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, add_op{});
}
catch (const std::bad_alloc&)
{
state.skip("Skipping: out of memory.");
}

NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types))
.set_name("tile_add")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T>
static void triad(nvbench::state& state, nvbench::type_list<T>)
try
{
const auto n = state.get_int64("Elements{io}");
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
bench_transform(state, cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, triad_op{});
}
catch (const std::bad_alloc&)
{
state.skip("Skipping: out of memory.");
}

NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types))
.set_name("tile_triad")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T>
static void nstream(nvbench::state& state, nvbench::type_list<T>)
try
{
const auto n = state.get_int64("Elements{io}");
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(3 * n);
state.add_global_memory_writes<T>(n);
bench_transform(state, cuda::std::tuple{a.begin(), b.begin(), c.begin()}, a.begin(), n, nstream_op{});
}
catch (const std::bad_alloc&)
{
state.skip("Skipping: out of memory.");
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
.set_name("tile_nstream")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
77 changes: 77 additions & 0 deletions cub/benchmarks/bench/transform/tile/copy.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Pure copy (identity transform) -- measures plain load/store bandwidth through the tile
// load_masked/store_masked path. The identity op registers a tile_operator substitute (gated); under
// --enable-tile + CCCL_ENABLE_TILE_TRANSFORM_DISPATCH the dispatch hook routes it to the tile kernel,
// otherwise it falls through to CUB's standard transform. This file disappears once tile dispatch is
// fully transparent.

#include "../common.h"

#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
# include <cuda_tile.h>
#endif

struct identity
{
template <class T>
__host__ __device__ auto operator()(T v) const
{
return v;
}
};

#if _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()
struct tile_identity
{
template <class T>
__tile__ auto operator()(T v) const
{
return v;
}
};

CUB_NAMESPACE_BEGIN
namespace transform
{
template <class T>
inline constexpr bool tile_eligible_v<identity, T, 1> = true;
template <>
struct tile_operator<identity>
{
using type = tile_identity;
};
} // namespace transform
CUB_NAMESPACE_END
#endif // _CCCL_CUB_TILE_TRANSFORM_DISPATCH_ENABLED()

#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types = nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::float64_t>;
#endif

template <typename T>
static void copy(nvbench::state& state, nvbench::type_list<T>)
try
{
const auto n = state.get_int64("Elements{io}");

thrust::device_vector<T> in = generate(n);
thrust::device_vector<T> out(n, thrust::no_init);

state.add_element_count(n);
state.add_global_memory_reads<T>(n);
state.add_global_memory_writes<T>(n);
bench_transform(state, cuda::std::tuple{in.begin()}, out.begin(), n, identity{});
}
catch (const std::bad_alloc&)
{
state.skip("Skipping: out of memory.");
}

NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(element_types))
.set_name("tile_copy")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Loading
Loading