Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
a79569e
Add initial free threading cuda.compute implementation
NaderAlAwar Jun 12, 2026
e461b59
Decouple from numba.cuda further in tests in favor of cupy and cuda.c…
NaderAlAwar Jun 12, 2026
d97d809
Stop wrapping binary search comparator in python callable
NaderAlAwar Jun 12, 2026
230bfad
Merge branch 'opkind-less-python-callable-fix' into test-minimal-cuda…
NaderAlAwar Jun 12, 2026
7bbf5bf
Merge branch 'test-minimal-cuda-cccl-extra' into free-threaded-cuda-cccl
NaderAlAwar Jun 12, 2026
e0c72ab
Add CI for 3.14t
NaderAlAwar Jun 12, 2026
dda2d2a
Add benchmarks to measure host side overhead
NaderAlAwar Jun 12, 2026
1786eb0
Use pytest-benchmark instead
NaderAlAwar Jun 12, 2026
fab2f71
Merge branch 'cuda-compute-host-benchmarks' into free-threaded-cuda-cccl
NaderAlAwar Jun 12, 2026
a921939
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 12, 2026
fadb32a
Merge branch 'cuda-compute-host-benchmarks' into free-threaded-cuda-cccl
NaderAlAwar Jun 12, 2026
a4858f9
Merge branch 'free-threaded-cuda-cccl' of github.com:NaderAlAwar/cccl…
NaderAlAwar Jun 12, 2026
1571d76
Add a case that accepts a stream
NaderAlAwar Jun 12, 2026
8172fd6
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 12, 2026
85342b9
Merge branch 'cuda-compute-host-benchmarks' into free-threaded-cuda-cccl
NaderAlAwar Jun 12, 2026
c564d3e
Address comments
NaderAlAwar Jun 12, 2026
dd27462
Merge branch 'main' into test-minimal-cuda-cccl-extra
NaderAlAwar Jun 12, 2026
79df8e0
fix merge conflict
NaderAlAwar Jun 12, 2026
c2e7c01
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 12, 2026
b09c9c6
Fix pre-commit
NaderAlAwar Jun 12, 2026
4e5b9cc
Document new caching behavior and add examples
NaderAlAwar Jun 15, 2026
cd0858e
Update developer documentation
NaderAlAwar Jun 15, 2026
7c19d58
Merge remote-tracking branch 'upstream/main' into test-minimal-cuda-c…
NaderAlAwar Jun 22, 2026
15652c0
Move no numba tests to separate file
NaderAlAwar Jun 23, 2026
df98ed6
Merge branch 'test-minimal-cuda-cccl-extra' into free-threaded-cuda-cccl
NaderAlAwar Jun 23, 2026
8ad101e
Merge branch 'main' into free-threaded-cuda-cccl
NaderAlAwar Jun 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions c/parallel/src/transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ struct transform_kernel_source
cub::detail::transform::cuda_expected<cub::detail::transform::async_config>
CacheAsyncConfiguration(const ActionT& action)
{
#if defined(CCCL_PYTHON_FREE_THREADED)
return action();
#else // defined(CCCL_PYTHON_FREE_THREADED)
auto cache = reinterpret_cast<transform::cache*>(build.cache);
if (cache == nullptr)
{
Expand All @@ -135,12 +138,16 @@ struct transform_kernel_source
cache->async_config = action();
}
return *cache->async_config;
#endif // defined(CCCL_PYTHON_FREE_THREADED)
}

template <class ActionT>
cub::detail::transform::cuda_expected<cub::detail::transform::prefetch_config>
CachePrefetchConfiguration(const ActionT& action)
{
#if defined(CCCL_PYTHON_FREE_THREADED)
return action();
#else // defined(CCCL_PYTHON_FREE_THREADED)
auto cache = reinterpret_cast<transform::cache*>(build.cache);
if (cache == nullptr)
{
Expand All @@ -151,6 +158,7 @@ struct transform_kernel_source
cache->prefetch_config = action();
}
return *cache->prefetch_config;
#endif // defined(CCCL_PYTHON_FREE_THREADED)
}

CUkernel TransformKernel() const
Expand Down Expand Up @@ -346,7 +354,9 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho
return CUDA_ERROR_OUT_OF_MEMORY;
}
std::memcpy(runtime_policy.get(), &policy_sel, sizeof(policy_sel));
auto cache_obj = std::make_unique<transform::cache>();
#if !defined(CCCL_PYTHON_FREE_THREADED)
auto cache_obj = std::make_unique<transform::cache>();
#endif // !defined(CCCL_PYTHON_FREE_THREADED)
auto kernel_name_copy = std::unique_ptr<char[]>(duplicate_c_string(kernel_lowered_name));

build_ptr->loaded_bytes_per_iteration = static_cast<int>(input_it.value_type.size);
Expand All @@ -372,7 +382,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho
build_ptr->payload_kind = CCCL_PAYLOAD_CUBIN;
}

build_ptr->cache = cache_obj.release();
#if defined(CCCL_PYTHON_FREE_THREADED)
build_ptr->cache = nullptr;
#else // defined(CCCL_PYTHON_FREE_THREADED)
build_ptr->cache = cache_obj.release();
#endif // defined(CCCL_PYTHON_FREE_THREADED)
build_ptr->transform_kernel_lowered_name = kernel_name_copy.release();
build_ptr->runtime_policy = runtime_policy.release();
build_ptr->runtime_policy_size = sizeof(policy_sel);
Expand Down Expand Up @@ -642,7 +656,9 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H
return CUDA_ERROR_OUT_OF_MEMORY;
}
std::memcpy(runtime_policy.get(), &policy_sel, sizeof(policy_sel));
auto cache_obj = std::make_unique<transform::cache>();
#if !defined(CCCL_PYTHON_FREE_THREADED)
auto cache_obj = std::make_unique<transform::cache>();
#endif // !defined(CCCL_PYTHON_FREE_THREADED)
auto kernel_name_copy = std::unique_ptr<char[]>(duplicate_c_string(kernel_lowered_name));

build_ptr->loaded_bytes_per_iteration = static_cast<int>((input1_it.value_type.size + input2_it.value_type.size));
Expand All @@ -668,7 +684,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H
build_ptr->payload_kind = CCCL_PAYLOAD_CUBIN;
}

build_ptr->cache = cache_obj.release();
#if defined(CCCL_PYTHON_FREE_THREADED)
build_ptr->cache = nullptr;
#else // defined(CCCL_PYTHON_FREE_THREADED)
build_ptr->cache = cache_obj.release();
#endif // defined(CCCL_PYTHON_FREE_THREADED)
build_ptr->transform_kernel_lowered_name = kernel_name_copy.release();
build_ptr->runtime_policy = runtime_policy.release();
build_ptr->runtime_policy_size = sizeof(policy_sel);
Expand Down
7 changes: 5 additions & 2 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,10 @@ workflows:
- {jobs: ['test'], project: 'cccl_c_stf', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
# Python -- pinned to gcc13 / msvc2022 for consistency across CTK images
- {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
- {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
- {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'}
- {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14', '3.14t'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
- {jobs: ['test'], project: 'python', py_version: ['3.14', '3.14t'], gpu: 'h100', cxx: 'gcc13'}
- {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'}
- {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14t', gpu: 'l4', cxx: 'gcc13'}
# CCCL packaging:
- {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'}
- {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
Expand Down Expand Up @@ -544,6 +546,7 @@ jobs:
test_py_headers: { name: "Test cuda.cccl.headers", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
test_py_coop: { name: "Test cuda.coop._experimental", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} }
test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }

# Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
Expand Down
36 changes: 36 additions & 0 deletions ci/test_cuda_compute_minimal_python.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash

set -euo pipefail

ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
repo_root="$(cd "$ci_dir/.." && pwd)"
source "$ci_dir/pyenv_helper.sh"

# Parse common arguments
source "$ci_dir/util/python/common_arg_parser.sh"
parse_python_args "$@"
require_py_version "Usage: $0 -py-version <python_version>"

cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)

# Setup Python environment
setup_python_env "${py_version}"

# Fetch or build the cuda_cccl wheel:
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
"$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/"
wheelhouse_dir="${repo_root}/wheelhouse"
else
"$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
wheelhouse_dir="${repo_root}/wheelhouse"
fi

# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
# full cu* extras because those pull in numba/numba-cuda.
CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"

cd "${repo_root}/python/cuda_cccl/tests/"
python -m pytest -n 6 -v compute/test_no_numba.py
4 changes: 2 additions & 2 deletions ci/test_cuda_compute_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ if [[ "${CCCL_PYTHON_USE_V2:-}" =~ ^(1|true|TRUE|on|ON)$ ]]; then
fi

cd "/home/coder/cccl/python/cuda_cccl/tests/"
python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large"
python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large"
python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large and not free_threading"
python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large and not free_threading"
5 changes: 3 additions & 2 deletions ci/windows/build_common_python.psm1
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ function Get-Python {
Returns the path of the Python interpreter satisfying the supplied
version, installing it via uv if necessary.
.PARAMETER Version
A string in the form 'M.m' (e.g., '3.10', '3.13').
A string in the form 'M.m' (e.g., '3.10', '3.13') or a free-threaded
version such as '3.14t'.
#>
[CmdletBinding()]
param(
[Parameter(Mandatory, Position = 0)]
[ValidatePattern('^\d+\.\d+$')]
[ValidatePattern('^\d+\.\d+t?$')]
[string]$Version
)

Expand Down
5 changes: 3 additions & 2 deletions ci/windows/build_cuda_cccl_python.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@

.PARAMETER PyVersion
**Required.** The Python version to use for building the wheel, expressed
as `<major>.<minor>` (e.g. `3.11`).
as `<major>.<minor>` (e.g. `3.11`) or a free-threaded version such as
`3.14t`.

.PARAMETER OnlyCudaMajor
Optional. Restricts the build to a single CUDA major version (`12` or `13`).
Expand Down Expand Up @@ -49,7 +50,7 @@
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+$")]
[ValidatePattern("^\d+\.\d+t?$")]
[string]$PyVersion,

[Parameter(Mandatory = $false)]
Expand Down
54 changes: 27 additions & 27 deletions ci/windows/test_cuda_cccl_examples_python.ps1
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+$")]
[string]$PyVersion
)
$ErrorActionPreference = "Stop"
# Import shared helpers
Import-Module "$PSScriptRoot/build_common.psm1"
Import-Module "$PSScriptRoot/build_common_python.psm1"
$python = Get-Python -Version $PyVersion
$cudaMajor = Get-CudaMajor
$repoRoot = Get-RepoRoot
${wheelPath} = Get-CudaCcclWheel
& $python -m pip install -U pip pytest pytest-xdist
& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
try {
& $python -m pytest -n 6 test_examples.py
}
finally { Pop-Location }
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+t?$")]
[string]$PyVersion
)

$ErrorActionPreference = "Stop"

# Import shared helpers
Import-Module "$PSScriptRoot/build_common.psm1"
Import-Module "$PSScriptRoot/build_common_python.psm1"

$python = Get-Python -Version $PyVersion
$cudaMajor = Get-CudaMajor

$repoRoot = Get-RepoRoot

${wheelPath} = Get-CudaCcclWheel
& $python -m pip install -U pip pytest pytest-xdist
& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"

Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
try {
& $python -m pytest -n 6 test_examples.py
}
finally { Pop-Location }
54 changes: 27 additions & 27 deletions ci/windows/test_cuda_cccl_headers_python.ps1
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+$")]
[string]$PyVersion
)
$ErrorActionPreference = "Stop"
# Import shared helpers
Import-Module "$PSScriptRoot/build_common.psm1"
Import-Module "$PSScriptRoot/build_common_python.psm1"
$python = Get-Python -Version $PyVersion
$cudaMajor = Get-CudaMajor
$repoRoot = Get-RepoRoot
${wheelPath} = Get-CudaCcclWheel
& $python -m pip install -U pip pytest pytest-xdist
& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
try {
& $python -m pytest -n auto -v headers/
}
finally { Pop-Location }
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+t?$")]
[string]$PyVersion
)

$ErrorActionPreference = "Stop"

# Import shared helpers
Import-Module "$PSScriptRoot/build_common.psm1"
Import-Module "$PSScriptRoot/build_common_python.psm1"

$python = Get-Python -Version $PyVersion
$cudaMajor = Get-CudaMajor

$repoRoot = Get-RepoRoot

${wheelPath} = Get-CudaCcclWheel
& $python -m pip install -U pip pytest pytest-xdist
& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"

Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
try {
& $python -m pytest -n auto -v headers/
}
finally { Pop-Location }
58 changes: 29 additions & 29 deletions ci/windows/test_cuda_compute_python.ps1
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+$")]
[string]$PyVersion
)
$ErrorActionPreference = "Stop"
# Import shared helpers
Import-Module "$PSScriptRoot/build_common.psm1"
Import-Module "$PSScriptRoot/build_common_python.psm1"
$python = Get-Python -Version $PyVersion
$cudaMajor = Get-CudaMajor
$repoRoot = Get-RepoRoot
$wheelPath = Get-CudaCcclWheel
& $python -m pip install -U pip pytest pytest-xdist
& $python -m pip install "$wheelPath[test-cu$cudaMajor]"
Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
try {
& $python -m pytest -n 6 -v compute/ -m "not large"
& $python -m pytest -n 0 -v compute/ -m "large"
}
finally { Pop-Location }
Param(
[Parameter(Mandatory = $true)]
[Alias("py-version")]
[ValidatePattern("^\d+\.\d+t?$")]
[string]$PyVersion
)

$ErrorActionPreference = "Stop"

# Import shared helpers
Import-Module "$PSScriptRoot/build_common.psm1"
Import-Module "$PSScriptRoot/build_common_python.psm1"

$python = Get-Python -Version $PyVersion
$cudaMajor = Get-CudaMajor

$repoRoot = Get-RepoRoot

$wheelPath = Get-CudaCcclWheel

& $python -m pip install -U pip pytest pytest-xdist
& $python -m pip install "$wheelPath[test-cu$cudaMajor]"

Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
try {
& $python -m pytest -n 6 -v compute/ -m "not large and not free_threading"
& $python -m pytest -n 0 -v compute/ -m "large and not free_threading"
}
finally { Pop-Location }
Loading