Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ members = [
"samples/introduction/async_api/kernels",
"samples/introduction/matmul",
"samples/introduction/matmul/kernels",
"samples/introduction/simple_atomic_intrinsics",
"samples/introduction/simple_atomic_intrinsics/kernels",

"tests/compiletests",
"tests/compiletests/deps-helper",
Expand Down
30 changes: 30 additions & 0 deletions crates/cuda_std/src/atomic/mid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,33 @@ macro_rules! impl_cas {
impl_cas! {
u32, u64, i32, i64, f32, f64
}

#[gpu_only]
#[allow(clippy::missing_safety_doc)]
/// Performs a bounded increment like CUDA's atomicInc: if *ptr >= bound then 0 else *ptr+1, returns old value.
pub unsafe fn atomic_inc_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
loop {
let old = intrinsics::atomic_load_relaxed_32_device(ptr);
let new = if old >= bound { 0 } else { old + 1 };
if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
return old;
}
}
}

#[gpu_only]
#[allow(clippy::missing_safety_doc)]
/// Performs a bounded decrement like CUDA's atomicDec: if *ptr == 0 || *ptr > bound then bound else *ptr-1, returns old value.
pub unsafe fn atomic_dec_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
loop {
let old = intrinsics::atomic_load_relaxed_32_device(ptr);
let new = if old == 0 || old > bound {
bound
} else {
old - 1
};
if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
return old;
}
}
}
10 changes: 10 additions & 0 deletions samples/introduction/simple_atomic_intrinsics/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "simple-atomic-intrinsics"
version = "0.1.0"
edition = "2024"

[dependencies]
cust = { path = "../../../crates/cust" }

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
17 changes: 17 additions & 0 deletions samples/introduction/simple_atomic_intrinsics/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
println!("cargo::rerun-if-changed=build.rs");
println!("cargo::rerun-if-changed=kernels");

let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());

CudaBuilder::new(manifest_dir.join("kernels"))
.copy_to(out_path.join("kernels.ptx"))
.build()
.unwrap();
}
10 changes: 10 additions & 0 deletions samples/introduction/simple_atomic_intrinsics/kernels/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "simple-atomic-intrinsics-kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
37 changes: 37 additions & 0 deletions samples/introduction/simple_atomic_intrinsics/kernels/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#![allow(
improper_ctypes_definitions,
clippy::missing_safety_doc,
unsafe_op_in_unsafe_fn
)]

use cuda_std::atomic::{intrinsics, mid};
use cuda_std::prelude::*;

#[kernel]
pub unsafe fn test_kernel(data: *mut i32) {
let tid = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as i32;

// Arithmetic atomics

intrinsics::atomic_fetch_add_relaxed_i32_device(data.add(0), 10);
intrinsics::atomic_fetch_sub_relaxed_i32_device(data.add(1), 10);
intrinsics::atomic_fetch_exch_relaxed_i32_device(data.add(2), tid);
intrinsics::atomic_fetch_max_relaxed_i32_device(data.add(3), tid);
intrinsics::atomic_fetch_min_relaxed_i32_device(data.add(4), tid);

mid::atomic_inc_bounded_relaxed_u32_device(data.add(5) as *mut u32, 17);

mid::atomic_dec_bounded_relaxed_u32_device(data.add(6) as *mut u32, 137);

intrinsics::atomic_fetch_cas_relaxed_i32_device(data.add(7), tid - 1, tid);

// Bitwise atomics

intrinsics::atomic_fetch_and_relaxed_i32_device(data.add(8), 2 * tid + 7);

// Match CUDA's `1 << tid` wrapping behaviour for tid >= 32 (PTX shl.b32 masks
// the shift count to 5 bits, same as Rust's wrapping_shl).
intrinsics::atomic_fetch_or_relaxed_i32_device(data.add(9), 1i32.wrapping_shl(tid as u32));

intrinsics::atomic_fetch_xor_relaxed_i32_device(data.add(10), tid);
}
178 changes: 178 additions & 0 deletions samples/introduction/simple_atomic_intrinsics/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/* Demonstrates trivial use of global-memory atomic device functions, mirroring
* NVIDIA's simpleAtomicIntrinsics CUDA sample.
*
* A 64×256 grid (16 384 threads) each performs eleven atomic operations on a
* shared 11-element i32 array and the host verifies the results.
*/

use cust::memory::{CopyDestination, DeviceBuffer};
use cust::module::Module;
use cust::stream::{Stream, StreamFlags};
use std::error::Error;
use std::time::Instant;

static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

const NUM_BLOCKS: u32 = 64;
const NUM_THREADS: u32 = 256;
const NUM_DATA: usize = 11;

fn compute_gold(gpu_data: &[i32; NUM_DATA], total_threads: usize) -> bool {
let len = total_threads;
let mut ok = true;

// slot 0 – atomicAdd(+10): sum of len additions of 10
let expected = 10 * len as i32;
if gpu_data[0] != expected {
println!("atomicAdd failed: expected {expected}, got {}", gpu_data[0]);
ok = false;
}

// slot 1 – atomicSub(-10)
let expected = -(10 * len as i32);
if gpu_data[1] != expected {
println!("atomicSub failed: expected {expected}, got {}", gpu_data[1]);
ok = false;
}

// slot 2 – atomicExch: final value must be a valid tid in [0, len)
if !(0..len as i32).contains(&gpu_data[2]) {
println!("atomicExch failed: got {}", gpu_data[2]);
ok = false;
}

// slot 3 – atomicMax: sequential max of 0..len starting from -(1<<8)
let expected = {
let mut v = -(1i32 << 8);
for i in 0..len {
v = v.max(i as i32);
}
v
};
if gpu_data[3] != expected {
println!("atomicMax failed: expected {expected}, got {}", gpu_data[3]);
ok = false;
}

// slot 4 – atomicMin
let expected = {
let mut v = 1i32 << 8;
for i in 0..len {
v = v.min(i as i32);
}
v
};
if gpu_data[4] != expected {
println!("atomicMin failed: expected {expected}, got {}", gpu_data[4]);
ok = false;
}

// slot 5 – atomicInc(limit=17): each thread does bounded inc, final value in [0, 16]
if !(0..=16).contains(&gpu_data[5]) {
println!("atomicInc failed: expected [0, 16], got {}", gpu_data[5]);
ok = false;
}

// slot 6 – atomicDec(limit=137): each thread does bounded dec, final value in [0, 137]
if !(0..=137).contains(&gpu_data[6]) {
println!("atomicDec failed: expected [0, 137], got {}", gpu_data[6]);
ok = false;
}

// slot 7 – atomicCAS: final value must be a valid tid in [0, len)
if !(0..len as i32).contains(&gpu_data[7]) {
println!("atomicCAS failed: got {}", gpu_data[7]);
ok = false;
}

// slot 8 – atomicAnd(2*tid+7) starting from 0xff
let expected = {
let mut v = 0xffi32;
for i in 0..len {
v &= 2 * i as i32 + 7;
}
v
};
if gpu_data[8] != expected {
println!("atomicAnd failed: expected {expected}, got {}", gpu_data[8]);
ok = false;
}

// slot 9 – atomicOr(1<<tid) starting from 0.
// For tid ≥ 32 the PTX shl.b32 wraps (modulo 32), same as wrapping_shl.
let expected = {
let mut v = 0i32;
for i in 0..len {
v |= 1i32.wrapping_shl(i as u32);
}
v
};
if gpu_data[9] != expected {
println!("atomicOr failed: expected {expected}, got {}", gpu_data[9]);
ok = false;
}

// slot 10 – atomicXor(tid) starting from 0xff
let expected = {
let mut v = 0xffi32;
for i in 0..len {
v ^= i as i32;
}
v
};
if gpu_data[10] != expected {
println!(
"atomicXor failed: expected {expected}, got {}",
gpu_data[10]
);
ok = false;
}

ok
}

fn main() -> Result<(), Box<dyn Error>> {
println!("simpleAtomicIntrinsics starting...");

let _ctx = cust::quick_init()?;
let module = Module::from_ptx(PTX, &[])?;
let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

let mut h_data = [0i32; NUM_DATA];
// AND and XOR tests start with 0xff in their slots
h_data[8] = 0xff;
h_data[10] = 0xff;

let d_data = DeviceBuffer::from_slice(&h_data)?;

let kernel = module.get_function("test_kernel")?;

let start = Instant::now();

unsafe {
cust::launch!(
kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_data.as_device_ptr())
)?;
}

stream.synchronize()?;

let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
println!("Processing time: {elapsed_ms:.3} ms");

d_data.copy_to(&mut h_data)?;

let total_threads = (NUM_BLOCKS * NUM_THREADS) as usize;
let passed = compute_gold(&h_data, total_threads);

println!(
"simpleAtomicIntrinsics completed, returned {}",
if passed { "OK" } else { "ERROR!" }
);

if !passed {
std::process::exit(1);
}

Ok(())
}