Rust-GPU · 39ali · Apr 26, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -56,6 +56,8 @@ members = [
   "samples/introduction/async_api/kernels",
   "samples/introduction/matmul",
   "samples/introduction/matmul/kernels",
+  "samples/introduction/simple_atomic_intrinsics",
+  "samples/introduction/simple_atomic_intrinsics/kernels",
 
   "tests/compiletests",
   "tests/compiletests/deps-helper",

diff --git a/crates/cuda_std/src/atomic/mid.rs b/crates/cuda_std/src/atomic/mid.rs
@@ -310,3 +310,33 @@ macro_rules! impl_cas {
 impl_cas! {
     u32, u64, i32, i64, f32, f64
 }
+
+#[gpu_only]
+#[allow(clippy::missing_safety_doc)]
+/// Performs a bounded increment like CUDA's atomicInc: if *ptr >= bound then 0 else *ptr+1, returns old value.
+pub unsafe fn atomic_inc_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
+    loop {
+        let old = intrinsics::atomic_load_relaxed_32_device(ptr);
+        let new = if old >= bound { 0 } else { old + 1 };
+        if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
+            return old;
+        }
+    }
+}
+
+#[gpu_only]
+#[allow(clippy::missing_safety_doc)]
+/// Performs a bounded decrement like CUDA's atomicDec: if *ptr == 0 || *ptr > bound then bound else *ptr-1, returns old value.
+pub unsafe fn atomic_dec_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
+    loop {
+        let old = intrinsics::atomic_load_relaxed_32_device(ptr);
+        let new = if old == 0 || old > bound {
+            bound
+        } else {
+            old - 1
+        };
+        if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
+            return old;
+        }
+    }
+}
diff --git a/samples/introduction/simple_atomic_intrinsics/Cargo.toml b/samples/introduction/simple_atomic_intrinsics/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "simple-atomic-intrinsics"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cust = { path = "../../../crates/cust" }
+
+[build-dependencies]
+cuda_builder = { workspace = true, default-features = false }
diff --git a/samples/introduction/simple_atomic_intrinsics/build.rs b/samples/introduction/simple_atomic_intrinsics/build.rs
@@ -0,0 +1,17 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+
+    CudaBuilder::new(manifest_dir.join("kernels"))
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+}
diff --git a/samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml b/samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "simple-atomic-intrinsics-kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
diff --git a/samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs b/samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs
@@ -0,0 +1,37 @@
+#![allow(
+    improper_ctypes_definitions,
+    clippy::missing_safety_doc,
+    unsafe_op_in_unsafe_fn
+)]
+
+use cuda_std::atomic::{intrinsics, mid};
+use cuda_std::prelude::*;
+
+#[kernel]
+pub unsafe fn test_kernel(data: *mut i32) {
+    let tid = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as i32;
+
+    // Arithmetic atomics
+
+    intrinsics::atomic_fetch_add_relaxed_i32_device(data.add(0), 10);
+    intrinsics::atomic_fetch_sub_relaxed_i32_device(data.add(1), 10);
+    intrinsics::atomic_fetch_exch_relaxed_i32_device(data.add(2), tid);
+    intrinsics::atomic_fetch_max_relaxed_i32_device(data.add(3), tid);
+    intrinsics::atomic_fetch_min_relaxed_i32_device(data.add(4), tid);
+
+    mid::atomic_inc_bounded_relaxed_u32_device(data.add(5) as *mut u32, 17);
+
+    mid::atomic_dec_bounded_relaxed_u32_device(data.add(6) as *mut u32, 137);
+
+    intrinsics::atomic_fetch_cas_relaxed_i32_device(data.add(7), tid - 1, tid);
+
+    // Bitwise atomics
+
+    intrinsics::atomic_fetch_and_relaxed_i32_device(data.add(8), 2 * tid + 7);
+
+    // Match CUDA's `1 << tid` wrapping behaviour for tid >= 32 (PTX shl.b32 masks
+    // the shift count to 5 bits, same as Rust's wrapping_shl).
+    intrinsics::atomic_fetch_or_relaxed_i32_device(data.add(9), 1i32.wrapping_shl(tid as u32));
+
+    intrinsics::atomic_fetch_xor_relaxed_i32_device(data.add(10), tid);
+}
diff --git a/samples/introduction/simple_atomic_intrinsics/src/main.rs b/samples/introduction/simple_atomic_intrinsics/src/main.rs
@@ -0,0 +1,178 @@
+/* Demonstrates trivial use of global-memory atomic device functions, mirroring
+ * NVIDIA's simpleAtomicIntrinsics CUDA sample.
+ *
+ * A 64×256 grid (16 384 threads) each performs eleven atomic operations on a
+ * shared 11-element i32 array and the host verifies the results.
+ */
+
+use cust::memory::{CopyDestination, DeviceBuffer};
+use cust::module::Module;
+use cust::stream::{Stream, StreamFlags};
+use std::error::Error;
+use std::time::Instant;
+
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+const NUM_BLOCKS: u32 = 64;
+const NUM_THREADS: u32 = 256;
+const NUM_DATA: usize = 11;
+
+fn compute_gold(gpu_data: &[i32; NUM_DATA], total_threads: usize) -> bool {
+    let len = total_threads;
+    let mut ok = true;
+
+    // slot 0 – atomicAdd(+10): sum of len additions of 10
+    let expected = 10 * len as i32;
+    if gpu_data[0] != expected {
+        println!("atomicAdd failed: expected {expected}, got {}", gpu_data[0]);
+        ok = false;
+    }
+
+    // slot 1 – atomicSub(-10)
+    let expected = -(10 * len as i32);
+    if gpu_data[1] != expected {
+        println!("atomicSub failed: expected {expected}, got {}", gpu_data[1]);
+        ok = false;
+    }
+
+    // slot 2 – atomicExch: final value must be a valid tid in [0, len)
+    if !(0..len as i32).contains(&gpu_data[2]) {
+        println!("atomicExch failed: got {}", gpu_data[2]);
+        ok = false;
+    }
+
+    // slot 3 – atomicMax: sequential max of 0..len starting from -(1<<8)
+    let expected = {
+        let mut v = -(1i32 << 8);
+        for i in 0..len {
+            v = v.max(i as i32);
+        }
+        v
+    };
+    if gpu_data[3] != expected {
+        println!("atomicMax failed: expected {expected}, got {}", gpu_data[3]);
+        ok = false;
+    }
+
+    // slot 4 – atomicMin
+    let expected = {
+        let mut v = 1i32 << 8;
+        for i in 0..len {
+            v = v.min(i as i32);
+        }
+        v
+    };
+    if gpu_data[4] != expected {
+        println!("atomicMin failed: expected {expected}, got {}", gpu_data[4]);
+        ok = false;
+    }
+
+    // slot 5 – atomicInc(limit=17): each thread does bounded inc, final value in [0, 16]
+    if !(0..=16).contains(&gpu_data[5]) {
+        println!("atomicInc failed: expected [0, 16], got {}", gpu_data[5]);
+        ok = false;
+    }
+
+    // slot 6 – atomicDec(limit=137): each thread does bounded dec, final value in [0, 137]
+    if !(0..=137).contains(&gpu_data[6]) {
+        println!("atomicDec failed: expected [0, 137], got {}", gpu_data[6]);
+        ok = false;
+    }
+
+    // slot 7 – atomicCAS: final value must be a valid tid in [0, len)
+    if !(0..len as i32).contains(&gpu_data[7]) {
+        println!("atomicCAS failed: got {}", gpu_data[7]);
+        ok = false;
+    }
+
+    // slot 8 – atomicAnd(2*tid+7) starting from 0xff
+    let expected = {
+        let mut v = 0xffi32;
+        for i in 0..len {
+            v &= 2 * i as i32 + 7;
+        }
+        v
+    };
+    if gpu_data[8] != expected {
+        println!("atomicAnd failed: expected {expected}, got {}", gpu_data[8]);
+        ok = false;
+    }
+
+    // slot 9 – atomicOr(1<<tid) starting from 0.
+    // For tid ≥ 32 the PTX shl.b32 wraps (modulo 32), same as wrapping_shl.
+    let expected = {
+        let mut v = 0i32;
+        for i in 0..len {
+            v |= 1i32.wrapping_shl(i as u32);
+        }
+        v
+    };
+    if gpu_data[9] != expected {
+        println!("atomicOr failed: expected {expected}, got {}", gpu_data[9]);
+        ok = false;
+    }
+
+    // slot 10 – atomicXor(tid) starting from 0xff
+    let expected = {
+        let mut v = 0xffi32;
+        for i in 0..len {
+            v ^= i as i32;
+        }
+        v
+    };
+    if gpu_data[10] != expected {
+        println!(
+            "atomicXor failed: expected {expected}, got {}",
+            gpu_data[10]
+        );
+        ok = false;
+    }
+
+    ok
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    println!("simpleAtomicIntrinsics starting...");
+
+    let _ctx = cust::quick_init()?;
+    let module = Module::from_ptx(PTX, &[])?;
+    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
+
+    let mut h_data = [0i32; NUM_DATA];
+    // AND and XOR tests start with 0xff in their slots
+    h_data[8] = 0xff;
+    h_data[10] = 0xff;
+
+    let d_data = DeviceBuffer::from_slice(&h_data)?;
+
+    let kernel = module.get_function("test_kernel")?;
+
+    let start = Instant::now();
+
+    unsafe {
+        cust::launch!(
+            kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_data.as_device_ptr())
+        )?;
+    }
+
+    stream.synchronize()?;
+
+    let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
+    println!("Processing time: {elapsed_ms:.3} ms");
+
+    d_data.copy_to(&mut h_data)?;
+
+    let total_threads = (NUM_BLOCKS * NUM_THREADS) as usize;
+    let passed = compute_gold(&h_data, total_threads);
+
+    println!(
+        "simpleAtomicIntrinsics completed, returned {}",
+        if passed { "OK" } else { "ERROR!" }
+    );
+
+    if !passed {
+        std::process::exit(1);
+    }
+
+    Ok(())
+}