NVIDIA · NaderAlAwar · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
@@ -125,6 +125,9 @@ struct transform_kernel_source
   cub::detail::transform::cuda_expected<cub::detail::transform::async_config>
   CacheAsyncConfiguration(const ActionT& action)
   {
+#if defined(CCCL_PYTHON_FREE_THREADED)
+    return action();
+#else // defined(CCCL_PYTHON_FREE_THREADED)
     auto cache = reinterpret_cast<transform::cache*>(build.cache);
     if (cache == nullptr)
     {
@@ -135,12 +138,16 @@ struct transform_kernel_source
       cache->async_config = action();
     }
     return *cache->async_config;
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
   }
 
   template <class ActionT>
   cub::detail::transform::cuda_expected<cub::detail::transform::prefetch_config>
   CachePrefetchConfiguration(const ActionT& action)
   {
+#if defined(CCCL_PYTHON_FREE_THREADED)
+    return action();
+#else // defined(CCCL_PYTHON_FREE_THREADED)
     auto cache = reinterpret_cast<transform::cache*>(build.cache);
     if (cache == nullptr)
     {
@@ -151,6 +158,7 @@ struct transform_kernel_source
       cache->prefetch_config = action();
     }
     return *cache->prefetch_config;
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
   }
 
   CUkernel TransformKernel() const
@@ -346,7 +354,9 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho
     return CUDA_ERROR_OUT_OF_MEMORY;
   }
   std::memcpy(runtime_policy.get(), &policy_sel, sizeof(policy_sel));
-  auto cache_obj        = std::make_unique<transform::cache>();
+#if !defined(CCCL_PYTHON_FREE_THREADED)
+  auto cache_obj = std::make_unique<transform::cache>();
+#endif // !defined(CCCL_PYTHON_FREE_THREADED)
   auto kernel_name_copy = std::unique_ptr<char[]>(duplicate_c_string(kernel_lowered_name));
 
   build_ptr->loaded_bytes_per_iteration = static_cast<int>(input_it.value_type.size);
@@ -372,7 +382,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho
     build_ptr->payload_kind  = CCCL_PAYLOAD_CUBIN;
   }
 
-  build_ptr->cache                         = cache_obj.release();
+#if defined(CCCL_PYTHON_FREE_THREADED)
+  build_ptr->cache = nullptr;
+#else // defined(CCCL_PYTHON_FREE_THREADED)
+  build_ptr->cache = cache_obj.release();
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
   build_ptr->transform_kernel_lowered_name = kernel_name_copy.release();
   build_ptr->runtime_policy                = runtime_policy.release();
   build_ptr->runtime_policy_size           = sizeof(policy_sel);
@@ -642,7 +656,9 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H
     return CUDA_ERROR_OUT_OF_MEMORY;
   }
   std::memcpy(runtime_policy.get(), &policy_sel, sizeof(policy_sel));
-  auto cache_obj        = std::make_unique<transform::cache>();
+#if !defined(CCCL_PYTHON_FREE_THREADED)
+  auto cache_obj = std::make_unique<transform::cache>();
+#endif // !defined(CCCL_PYTHON_FREE_THREADED)
   auto kernel_name_copy = std::unique_ptr<char[]>(duplicate_c_string(kernel_lowered_name));
 
   build_ptr->loaded_bytes_per_iteration = static_cast<int>((input1_it.value_type.size + input2_it.value_type.size));
@@ -668,7 +684,11 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H
     build_ptr->payload_kind  = CCCL_PAYLOAD_CUBIN;
   }
 
-  build_ptr->cache                         = cache_obj.release();
+#if defined(CCCL_PYTHON_FREE_THREADED)
+  build_ptr->cache = nullptr;
+#else // defined(CCCL_PYTHON_FREE_THREADED)
+  build_ptr->cache = cache_obj.release();
+#endif // defined(CCCL_PYTHON_FREE_THREADED)
   build_ptr->transform_kernel_lowered_name = kernel_name_copy.release();
   build_ptr->runtime_policy                = runtime_policy.release();
   build_ptr->runtime_policy_size           = sizeof(policy_sel);

@@ -87,8 +87,10 @@ workflows:
     - {jobs: ['test'], project: 'cccl_c_stf', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
     # Python -- pinned to gcc13 / msvc2022 for consistency across CTK images
     - {jobs: ['test'], project: 'python', ctk: ['12.X',        '13.X'], py_version: ['3.10'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
-    - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
-    - {jobs: ['test'], project: 'python', py_version: '3.14', gpu: 'h100', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', ctk: ['12.X','13.0', '13.X'], py_version: ['3.14', '3.14t'], gpu: 'l4', cxx: ['gcc13', 'msvc2022']}
+    - {jobs: ['test'], project: 'python', py_version: ['3.14', '3.14t'], gpu: 'h100', cxx: 'gcc13'}
+    - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14', gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test_py_compute_minimal'], project: 'python', ctk: '13.X', py_version: '3.14t', gpu: 'l4', cxx: 'gcc13'}
     # CCCL packaging:
     - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080', args: '-min-cmake'}
     - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
@@ -544,6 +546,7 @@ jobs:
   test_py_headers:  { name: "Test cuda.cccl.headers",      gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
   test_py_coop:     { name: "Test cuda.coop._experimental",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
   test_py_par:      { name: "Test cuda.compute",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
+  test_py_compute_minimal: { name: "Test cuda.compute minimal", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute_minimal'} }
   test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
 
   # Run jobs for 'target' project (ci/util/build_and_test_targets.sh):

@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(cd "$ci_dir/.." && pwd)"
+source "$ci_dir/pyenv_helper.sh"
+
+# Parse common arguments
+source "$ci_dir/util/python/common_arg_parser.sh"
+parse_python_args "$@"
+require_py_version "Usage: $0 -py-version <python_version>"
+
+cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)
+
+# Setup Python environment
+setup_python_env "${py_version}"
+
+# Fetch or build the cuda_cccl wheel:
+if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
+  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" "${repo_root}/"
+  wheelhouse_dir="${repo_root}/wheelhouse"
+else
+  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
+  wheelhouse_dir="${repo_root}/wheelhouse"
+fi
+
+# Install cuda_cccl with the minimal CUDA extra. This intentionally avoids the
+# full cu* extras because those pull in numba/numba-cuda.
+CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
+python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
+python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
+
+cd "${repo_root}/python/cuda_cccl/tests/"
+python -m pytest -n 6 -v compute/test_no_numba.py
@@ -35,5 +35,5 @@ if [[ "${CCCL_PYTHON_USE_V2:-}" =~ ^(1|true|TRUE|on|ON)$ ]]; then
 fi
 
 cd "/home/coder/cccl/python/cuda_cccl/tests/"
-python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large"
-python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large"
+python -m pytest "${pytest_extra[@]}" -n 6 -v compute/ -m "not large and not free_threading"
+python -m pytest "${pytest_extra[@]}" -n 0 -v compute/ -m "large and not free_threading"
@@ -4,12 +4,13 @@ function Get-Python {
         Returns the path of the Python interpreter satisfying the supplied
         version, installing it via uv if necessary.
     .PARAMETER Version
-        A string in the form 'M.m' (e.g., '3.10', '3.13').
+        A string in the form 'M.m' (e.g., '3.10', '3.13') or a free-threaded
+        version such as '3.14t'.
     #>
     [CmdletBinding()]
     param(
         [Parameter(Mandatory, Position = 0)]
-        [ValidatePattern('^\d+\.\d+$')]
+        [ValidatePattern('^\d+\.\d+t?$')]
         [string]$Version
     )
 

@@ -21,7 +21,8 @@
 
 .PARAMETER PyVersion
     **Required.** The Python version to use for building the wheel, expressed
-    as `<major>.<minor>` (e.g. `3.11`).
+    as `<major>.<minor>` (e.g. `3.11`) or a free-threaded version such as
+    `3.14t`.
 
 .PARAMETER OnlyCudaMajor
     Optional. Restricts the build to a single CUDA major version (`12` or `13`).
@@ -49,7 +50,7 @@
 Param(
     [Parameter(Mandatory = $true)]
     [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
+    [ValidatePattern("^\d+\.\d+t?$")]
     [string]$PyVersion,
 
     [Parameter(Mandatory = $false)]

@@ -1,27 +1,27 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-$repoRoot = Get-RepoRoot
-
-${wheelPath} = Get-CudaCcclWheel
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
-
-Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n 6 test_examples.py
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+$repoRoot = Get-RepoRoot
+
+${wheelPath} = Get-CudaCcclWheel
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
+
+Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n 6 test_examples.py
+}
+finally { Pop-Location }
@@ -1,27 +1,27 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-$repoRoot = Get-RepoRoot
-
-${wheelPath} = Get-CudaCcclWheel
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
-
-Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n auto -v headers/
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+$repoRoot = Get-RepoRoot
+
+${wheelPath} = Get-CudaCcclWheel
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
+
+Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n auto -v headers/
+}
+finally { Pop-Location }
@@ -1,29 +1,29 @@
-Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("py-version")]
-    [ValidatePattern("^\d+\.\d+$")]
-    [string]$PyVersion
-)
-
-$ErrorActionPreference = "Stop"
-
-# Import shared helpers
-Import-Module "$PSScriptRoot/build_common.psm1"
-Import-Module "$PSScriptRoot/build_common_python.psm1"
-
-$python = Get-Python -Version $PyVersion
-$cudaMajor = Get-CudaMajor
-
-$repoRoot = Get-RepoRoot
-
-$wheelPath = Get-CudaCcclWheel
-
-& $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "$wheelPath[test-cu$cudaMajor]"
-
-Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
-try {
-    & $python -m pytest -n 6 -v compute/ -m "not large"
-    & $python -m pytest -n 0 -v compute/ -m "large"
-}
-finally { Pop-Location }
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("py-version")]
+    [ValidatePattern("^\d+\.\d+t?$")]
+    [string]$PyVersion
+)
+
+$ErrorActionPreference = "Stop"
+
+# Import shared helpers
+Import-Module "$PSScriptRoot/build_common.psm1"
+Import-Module "$PSScriptRoot/build_common_python.psm1"
+
+$python = Get-Python -Version $PyVersion
+$cudaMajor = Get-CudaMajor
+
+$repoRoot = Get-RepoRoot
+
+$wheelPath = Get-CudaCcclWheel
+
+& $python -m pip install -U pip pytest pytest-xdist
+& $python -m pip install "$wheelPath[test-cu$cudaMajor]"
+
+Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
+try {
+    & $python -m pytest -n 6 -v compute/ -m "not large and not free_threading"
+    & $python -m pytest -n 0 -v compute/ -m "large and not free_threading"
+}
+finally { Pop-Location }