From bb3bbdd653aa53bd6bdad1d63e7dea4e3c07662e Mon Sep 17 00:00:00 2001 From: 39ali Date: Sun, 26 Apr 2026 23:21:07 +0300 Subject: [PATCH] atomicInc/atomicDec with bounds --- Cargo.toml | 2 + crates/cuda_std/src/atomic/mid.rs | 30 +++ .../simple_atomic_intrinsics/Cargo.toml | 10 + .../simple_atomic_intrinsics/build.rs | 17 ++ .../kernels/Cargo.toml | 10 + .../kernels/src/lib.rs | 37 ++++ .../simple_atomic_intrinsics/src/main.rs | 178 ++++++++++++++++++ 7 files changed, 284 insertions(+) create mode 100644 samples/introduction/simple_atomic_intrinsics/Cargo.toml create mode 100644 samples/introduction/simple_atomic_intrinsics/build.rs create mode 100644 samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml create mode 100644 samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs create mode 100644 samples/introduction/simple_atomic_intrinsics/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 108b31a6..794234ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,8 @@ members = [ "samples/introduction/async_api/kernels", "samples/introduction/matmul", "samples/introduction/matmul/kernels", + "samples/introduction/simple_atomic_intrinsics", + "samples/introduction/simple_atomic_intrinsics/kernels", "tests/compiletests", "tests/compiletests/deps-helper", diff --git a/crates/cuda_std/src/atomic/mid.rs b/crates/cuda_std/src/atomic/mid.rs index ad451bff..2157bcac 100644 --- a/crates/cuda_std/src/atomic/mid.rs +++ b/crates/cuda_std/src/atomic/mid.rs @@ -310,3 +310,33 @@ macro_rules! impl_cas { impl_cas! { u32, u64, i32, i64, f32, f64 } + +#[gpu_only] +#[allow(clippy::missing_safety_doc)] +/// Performs a bounded increment like CUDA's atomicInc: if *ptr >= bound then 0 else *ptr+1, returns old value. 
+pub unsafe fn atomic_inc_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 { + loop { + let old = intrinsics::atomic_load_relaxed_32_device(ptr); + let new = if old >= bound { 0 } else { old + 1 }; + if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old { + return old; + } + } +} + +#[gpu_only] +#[allow(clippy::missing_safety_doc)] +/// Performs a bounded decrement like CUDA's atomicDec: if *ptr == 0 || *ptr > bound then bound else *ptr-1, returns old value. +pub unsafe fn atomic_dec_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 { + loop { + let old = intrinsics::atomic_load_relaxed_32_device(ptr); + let new = if old == 0 || old > bound { + bound + } else { + old - 1 + }; + if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old { + return old; + } + } +} diff --git a/samples/introduction/simple_atomic_intrinsics/Cargo.toml b/samples/introduction/simple_atomic_intrinsics/Cargo.toml new file mode 100644 index 00000000..82b6b9d4 --- /dev/null +++ b/samples/introduction/simple_atomic_intrinsics/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "simple-atomic-intrinsics" +version = "0.1.0" +edition = "2024" + +[dependencies] +cust = { path = "../../../crates/cust" } + +[build-dependencies] +cuda_builder = { workspace = true, default-features = false } diff --git a/samples/introduction/simple_atomic_intrinsics/build.rs b/samples/introduction/simple_atomic_intrinsics/build.rs new file mode 100644 index 00000000..7f23bac1 --- /dev/null +++ b/samples/introduction/simple_atomic_intrinsics/build.rs @@ -0,0 +1,17 @@ +use std::env; +use std::path; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo::rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=kernels"); + + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + + CudaBuilder::new(manifest_dir.join("kernels")) + 
.copy_to(out_path.join("kernels.ptx")) + .build() + .unwrap(); +} diff --git a/samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml b/samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml new file mode 100644 index 00000000..7af023c7 --- /dev/null +++ b/samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "simple-atomic-intrinsics-kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs b/samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs new file mode 100644 index 00000000..3ff9d623 --- /dev/null +++ b/samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs @@ -0,0 +1,37 @@ +#![allow( + improper_ctypes_definitions, + clippy::missing_safety_doc, + unsafe_op_in_unsafe_fn +)] + +use cuda_std::atomic::{intrinsics, mid}; +use cuda_std::prelude::*; + +#[kernel] +pub unsafe fn test_kernel(data: *mut i32) { + let tid = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as i32; + + // Arithmetic atomics + + intrinsics::atomic_fetch_add_relaxed_i32_device(data.add(0), 10); + intrinsics::atomic_fetch_sub_relaxed_i32_device(data.add(1), 10); + intrinsics::atomic_fetch_exch_relaxed_i32_device(data.add(2), tid); + intrinsics::atomic_fetch_max_relaxed_i32_device(data.add(3), tid); + intrinsics::atomic_fetch_min_relaxed_i32_device(data.add(4), tid); + + mid::atomic_inc_bounded_relaxed_u32_device(data.add(5) as *mut u32, 17); + + mid::atomic_dec_bounded_relaxed_u32_device(data.add(6) as *mut u32, 137); + + intrinsics::atomic_fetch_cas_relaxed_i32_device(data.add(7), tid - 1, tid); + + // Bitwise atomics + + intrinsics::atomic_fetch_and_relaxed_i32_device(data.add(8), 2 * tid + 7); + + // Match CUDA's `1 << tid` wrapping behaviour for tid >= 32 (PTX shl.b32 masks + // the shift count to 5 
bits, same as Rust's wrapping_shl). + intrinsics::atomic_fetch_or_relaxed_i32_device(data.add(9), 1i32.wrapping_shl(tid as u32)); + + intrinsics::atomic_fetch_xor_relaxed_i32_device(data.add(10), tid); +} diff --git a/samples/introduction/simple_atomic_intrinsics/src/main.rs b/samples/introduction/simple_atomic_intrinsics/src/main.rs new file mode 100644 index 00000000..87fb8d0e --- /dev/null +++ b/samples/introduction/simple_atomic_intrinsics/src/main.rs @@ -0,0 +1,178 @@ +/* Demonstrates trivial use of global-memory atomic device functions, mirroring + * NVIDIA's simpleAtomicIntrinsics CUDA sample. + * + * A 64×256 grid (16 384 threads) each performs eleven atomic operations on a + * shared 11-element i32 array and the host verifies the results. + */ + +use cust::memory::{CopyDestination, DeviceBuffer}; +use cust::module::Module; +use cust::stream::{Stream, StreamFlags}; +use std::error::Error; +use std::time::Instant; + +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); + +const NUM_BLOCKS: u32 = 64; +const NUM_THREADS: u32 = 256; +const NUM_DATA: usize = 11; + +fn compute_gold(gpu_data: &[i32; NUM_DATA], total_threads: usize) -> bool { + let len = total_threads; + let mut ok = true; + + // slot 0 – atomicAdd(+10): sum of len additions of 10 + let expected = 10 * len as i32; + if gpu_data[0] != expected { + println!("atomicAdd failed: expected {expected}, got {}", gpu_data[0]); + ok = false; + } + + // slot 1 – atomicSub(-10) + let expected = -(10 * len as i32); + if gpu_data[1] != expected { + println!("atomicSub failed: expected {expected}, got {}", gpu_data[1]); + ok = false; + } + + // slot 2 – atomicExch: final value must be a valid tid in [0, len) + if !(0..len as i32).contains(&gpu_data[2]) { + println!("atomicExch failed: got {}", gpu_data[2]); + ok = false; + } + + // slot 3 – atomicMax: sequential max of 0..len starting from -(1<<8) + let expected = { + let mut v = -(1i32 << 8); + for i in 0..len { + v = v.max(i as i32); + } + 
v
+    };
+    if gpu_data[3] != expected {
+        println!("atomicMax failed: expected {expected}, got {}", gpu_data[3]);
+        ok = false;
+    }
+
+    // slot 4 – atomicMin
+    let expected = {
+        let mut v = 1i32 << 8;
+        for i in 0..len {
+            v = v.min(i as i32);
+        }
+        v
+    };
+    if gpu_data[4] != expected {
+        println!("atomicMin failed: expected {expected}, got {}", gpu_data[4]);
+        ok = false;
+    }
+
+    // slot 5 – atomicInc(limit=17): bounded inc cycles through 0..=17 (old == 16
+    // stores 17; old >= 17 wraps to 0), so the final value is in [0, 17]
+    if !(0..=17).contains(&gpu_data[5]) {
+        println!("atomicInc failed: expected [0, 17], got {}", gpu_data[5]);
+        ok = false;
+    }
+
+    // slot 6 – atomicDec(limit=137): each thread does bounded dec, final value in [0, 137]
+    if !(0..=137).contains(&gpu_data[6]) {
+        println!("atomicDec failed: expected [0, 137], got {}", gpu_data[6]);
+        ok = false;
+    }
+
+    // slot 7 – atomicCAS: final value must be a valid tid in [0, len)
+    if !(0..len as i32).contains(&gpu_data[7]) {
+        println!("atomicCAS failed: got {}", gpu_data[7]);
+        ok = false;
+    }
+
+    // slot 8 – atomicAnd(2*tid+7) starting from 0xff
+    let expected = {
+        let mut v = 0xffi32;
+        for i in 0..len {
+            v &= 2 * i as i32 + 7;
+        }
+        v
+    };
+    if gpu_data[8] != expected {
+        println!("atomicAnd failed: expected {expected}, got {}", gpu_data[8]);
+        ok = false;
+    }
+
+    // slot 9 – atomicOr(1<<tid) starting from 0, using the same wrapping shift
+    // as the kernel so tid >= 32 matches PTX shl.b32 semantics
+    let expected = {
+        let mut v = 0i32;
+        for i in 0..len {
+            v |= 1i32.wrapping_shl(i as u32);
+        }
+        v
+    };
+    if gpu_data[9] != expected {
+        println!("atomicOr failed: expected {expected}, got {}", gpu_data[9]);
+        ok = false;
+    }
+
+    // slot 10 – atomicXor(tid) starting from 0xff
+    let expected = {
+        let mut v = 0xffi32;
+        for i in 0..len {
+            v ^= i as i32;
+        }
+        v
+    };
+    if gpu_data[10] != expected {
+        println!("atomicXor failed: expected {expected}, got {}", gpu_data[10]);
+        ok = false;
+    }
+
+    ok
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    println!("simpleAtomicIntrinsics starting...");
+
+    let _ctx = cust::quick_init()?;
+    let module = Module::from_ptx(PTX, &[])?;
+    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
+
+    let mut h_data = [0i32; NUM_DATA];
+    // AND and XOR tests start with 0xff in their slots
+    h_data[8] = 0xff;
+    h_data[10] = 0xff;
+
+    let d_data = DeviceBuffer::from_slice(&h_data)?;
+
+    let kernel = module.get_function("test_kernel")?;
+
+    let start = Instant::now();
+
+    unsafe {
+        cust::launch!(
+            kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_data.as_device_ptr())
+        )?;
+    }
+
+    stream.synchronize()?;
+
+    let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
+    println!("Processing time: {elapsed_ms:.3} ms");
+
+    d_data.copy_to(&mut h_data)?;
+
+    let total_threads = (NUM_BLOCKS * NUM_THREADS) as usize;
+    let passed = compute_gold(&h_data, total_threads);
+
+    println!(
+        "simpleAtomicIntrinsics completed, returned {}",
+        if passed { "OK" } else { "ERROR!" }
+    );
+
+    if !passed {
+        std::process::exit(1);
+    }
+
+    Ok(())
+}