diff --git a/compiler/rustc_abi/src/lib.rs b/compiler/rustc_abi/src/lib.rs index 450a93ee8481e..96e4b822b041d 100644 --- a/compiler/rustc_abi/src/lib.rs +++ b/compiler/rustc_abi/src/lib.rs @@ -1753,6 +1753,9 @@ pub struct AddressSpace(pub u32); impl AddressSpace { /// LLVM's `0` address space. pub const ZERO: Self = AddressSpace(0); + /// The address space for workgroup memory on nvptx and amdgpu. + /// See e.g. the `gpu_launch_sized_workgroup_mem` intrinsic for details. + pub const GPU_WORKGROUP: Self = AddressSpace(3); } /// How many scalable vectors are in a `BackendRepr::ScalableVector`? diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs index d7b8a304e9591..419d38f95e595 100644 --- a/compiler/rustc_codegen_llvm/src/declare.rs +++ b/compiler/rustc_codegen_llvm/src/declare.rs @@ -14,6 +14,7 @@ use std::borrow::Borrow; use itertools::Itertools; +use rustc_abi::AddressSpace; use rustc_codegen_ssa::traits::{MiscCodegenMethods, TypeMembershipCodegenMethods}; use rustc_data_structures::fx::FxIndexSet; use rustc_middle::ty::{Instance, Ty}; @@ -104,6 +105,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> { ) } } + + /// Declare a global value in a specific address space. + /// + /// If there’s a value with the same name already declared, the function will + /// return its Value instead. 
+ pub(crate) fn declare_global_in_addrspace( + &self, + name: &str, + ty: &'ll Type, + addr_space: AddressSpace, + ) -> &'ll Value { + debug!("declare_global_in_addrspace(name={name:?}, addrspace={addr_space:?})"); + unsafe { + llvm::LLVMRustGetOrInsertGlobalInAddrspace( + (**self).borrow().llmod, + name.as_c_char_ptr(), + name.len(), + ty, + addr_space.0, + ) + } + } } impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index b8ba6db9aa807..a298cf963dccf 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -3,8 +3,8 @@ use std::ffi::c_uint; use std::{assert_matches, iter, ptr}; use rustc_abi::{ - Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive, Size, - WrappingRange, + AddressSpace, Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive, + Size, WrappingRange, }; use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh}; use rustc_codegen_ssa::common::{IntPredicate, TypeKind}; @@ -176,6 +176,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { span: Span, ) -> Result<(), ty::Instance<'tcx>> { let tcx = self.tcx; + let llvm_version = crate::llvm_util::get_version(); let name = tcx.item_name(instance.def_id()); let fn_args = instance.args; @@ -192,7 +193,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { | sym::maximum_number_nsz_f64 | sym::maximum_number_nsz_f128 // Need at least LLVM 22 for `min/maximumnum` to not crash LLVM. - if crate::llvm_util::get_version() >= (22, 0, 0) => + if llvm_version >= (22, 0, 0) => { let intrinsic_name = if name.as_str().starts_with("min") { "llvm.minimumnum" @@ -418,7 +419,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { } // FIXME move into the branch below when LLVM 22 is the lowest version we support. 
- sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => { + sym::carryless_mul if llvm_version >= (22, 0, 0) => { let ty = args[0].layout.ty; if !ty.is_integral() { tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType { @@ -618,6 +619,46 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } + sym::gpu_launch_sized_workgroup_mem => { + // Generate an anonymous global per call, with these properties: + // 1. The global is in the address space for workgroup memory + // 2. It is an `external` global + // 3. It is correctly aligned for the pointee `T` + // All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend. + // The name is irrelevant. + // See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared + let name = if llvm_version < (23, 0, 0) && tcx.sess.target.arch == Arch::Nvptx64 { + // The auto-assigned name for extern shared globals in the nvptx backend does + // not compile in ptxas. Workaround this issue by assigning a name. + // Fixed in LLVM 23. + "gpu_launch_sized_workgroup_mem" + } else { + "" + }; + let global = self.declare_global_in_addrspace( + name, + self.type_array(self.type_i8(), 0), + AddressSpace::GPU_WORKGROUP, + ); + let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() }; + // The alignment of the global is used to specify the *minimum* alignment that + // must be obeyed by the GPU runtime. + // When multiple of these global variables are used by a kernel, the maximum alignment is taken. 
+ // See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821 + let alignment = self.align_of(*inner_ty).bytes() as u32; + unsafe { + // FIXME Workaround the above issue by taking maximum alignment if the global existed + if tcx.sess.target.arch == Arch::Nvptx64 { + if alignment > llvm::LLVMGetAlignment(global) { + llvm::LLVMSetAlignment(global, alignment); + } + } else { + llvm::LLVMSetAlignment(global, alignment); + } + } + self.cx().const_pointercast(global, self.type_ptr()) + } + sym::amdgpu_dispatch_ptr => { let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]); // Relying on `LLVMBuildPointerCast` to produce an addrspacecast diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 525d1dbe9d0d3..3e373c42eca34 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -2003,6 +2003,13 @@ unsafe extern "C" { NameLen: size_t, T: &'a Type, ) -> &'a Value; + pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>( + M: &'a Module, + Name: *const c_char, + NameLen: size_t, + T: &'a Type, + AddressSpace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMRustGetNamedValue( M: &Module, Name: *const c_char, diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index fd0c7c656ac21..f4a5e8baa2a5f 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { sym::abort | sym::unreachable | sym::cold_path + | sym::gpu_launch_sized_workgroup_mem | sym::breakpoint | sym::amdgpu_dispatch_ptr | sym::assert_zero_valid diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index d952faa5edb74..9cc66f3c2adf6 100644 --- 
a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -130,6 +130,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::forget | sym::frem_algebraic | sym::fsub_algebraic + | sym::gpu_launch_sized_workgroup_mem | sym::is_val_statically_known | sym::log2f16 | sym::log2f32 @@ -297,6 +298,7 @@ pub(crate) fn check_intrinsic_type( sym::field_offset => (1, 0, vec![], tcx.types.usize), sym::rustc_peek => (1, 0, vec![param(0)], param(0)), sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()), + sym::gpu_launch_sized_workgroup_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))), sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => { (1, 0, vec![], tcx.types.unit) } diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp index c310e580af559..91bb1c9733630 100644 --- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp +++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp @@ -299,10 +299,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M, .getCallee()); } -extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, - const char *Name, - size_t NameLen, - LLVMTypeRef Ty) { +// Get the global variable with the given name if it exists or create a new +// external global. 
+extern "C" LLVMValueRef +LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name, + size_t NameLen, LLVMTypeRef Ty, + unsigned int AddressSpace) { Module *Mod = unwrap(M); auto NameRef = StringRef(Name, NameLen); @@ -313,10 +315,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true); if (!GV) GV = new GlobalVariable(*Mod, unwrap(Ty), false, - GlobalValue::ExternalLinkage, nullptr, NameRef); + GlobalValue::ExternalLinkage, nullptr, NameRef, + nullptr, GlobalValue::NotThreadLocal, AddressSpace); return wrap(GV); } +// Get the global variable with the given name if it exists or create a new +// external global. +extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, + const char *Name, + size_t NameLen, + LLVMTypeRef Ty) { + Module *Mod = unwrap(M); + unsigned int AddressSpace = + Mod->getDataLayout().getDefaultGlobalsAddressSpace(); + return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, + AddressSpace); +} + // Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`. enum class LLVMRustAttributeKind { AlwaysInline = 0, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 981bfed363dcc..5e7834e800ff7 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1032,6 +1032,7 @@ symbols! { global_asm, global_registration, globs, + gpu_launch_sized_workgroup_mem, gt, guard, guard_patterns, diff --git a/library/core/src/intrinsics/gpu.rs b/library/core/src/intrinsics/gpu.rs index 9e7624841d0c6..43cb7251c3c88 100644 --- a/library/core/src/intrinsics/gpu.rs +++ b/library/core/src/intrinsics/gpu.rs @@ -5,6 +5,51 @@ #![unstable(feature = "gpu_intrinsics", issue = "none")] +/// Returns the pointer to workgroup memory allocated at launch-time on GPUs. +/// +/// Workgroup memory is a memory region that is shared between all threads in +/// the same workgroup. 
It is faster to access than other memory but pointers do not +/// work outside the workgroup where they were obtained. +/// Workgroup memory can be allocated statically or after compilation, when +/// launching a gpu-kernel. `gpu_launch_sized_workgroup_mem` returns the pointer to +/// the memory that is allocated at launch-time. +/// The size of this memory can differ between launches of a gpu-kernel, depending on +/// what is specified at launch-time. +/// However, the alignment is fixed by the kernel itself, at compile-time. +/// +/// The returned pointer is the start of the workgroup memory region that is +/// allocated at launch-time. +/// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, independent of the +/// generic type, return the same address, so alias the same memory. +/// The returned pointer is aligned by at least the alignment of `T`. +/// +/// If `gpu_launch_sized_workgroup_mem` is invoked multiple times with different +/// types that have different alignment, then you may only rely on the resulting +/// pointer having the alignment of `T` after a call to `gpu_launch_sized_workgroup_mem::<T>` +/// has occurred in the current program execution. +/// +/// # Safety +/// +/// The pointer is safe to dereference from the start (the returned pointer) up to the +/// size of workgroup memory that was specified when launching the current gpu-kernel. +/// This allocated size is not related in any way to `T`. +/// +/// The user must take care of synchronizing access to workgroup memory between +/// threads in a workgroup. The usual data race requirements apply. +/// +/// # Other APIs +/// +/// CUDA and HIP call this dynamic shared memory, shared between threads in a block. +/// OpenCL and SYCL call this local memory, shared between threads in a work-group. +/// GLSL calls this shared memory, shared between invocations in a work group. +/// DirectX calls this groupshared memory, shared between threads in a thread-group. 
+#[must_use = "returns a pointer that does nothing unless used"] +#[rustc_intrinsic] +#[rustc_nounwind] +#[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")] +#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))] +pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T; /// Returns a pointer to the HSA kernel dispatch packet. /// /// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet. diff --git a/src/tools/tidy/src/style.rs b/src/tools/tidy/src/style.rs index d144ffa222097..4e2f71b94ce2d 100644 --- a/src/tools/tidy/src/style.rs +++ b/src/tools/tidy/src/style.rs @@ -222,6 +222,10 @@ fn should_ignore(line: &str) -> bool { || static_regex!( "\\s*//@ \\!?(count|files|has|has-dir|hasraw|matches|matchesraw|snapshot)\\s.*" ).is_match(line) + // Matching for FileCheck checks + || static_regex!( + "\\s*// [a-zA-Z0-9-_]*:\\s.*" + ).is_match(line) } /// Returns `true` if `line` is allowed to be longer than the normal limit. diff --git a/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs b/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs new file mode 100644 index 0000000000000..4764160fd0b59 --- /dev/null +++ b/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs @@ -0,0 +1,41 @@ +// Checks that the GPU intrinsic to get launch-sized workgroup memory works +// and correctly aligns the `external addrspace(...) global`s over multiple calls. 
+ +//@ revisions: amdgpu nvptx-pre-llvm-23 nvptx-post-llvm-23 +//@ compile-flags: --crate-type=rlib -Copt-level=1 +// +//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 +//@ [amdgpu] needs-llvm-components: amdgpu + +//@ [nvptx-pre-llvm-23] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx-pre-llvm-23] needs-llvm-components: nvptx +//@ [nvptx-pre-llvm-23] max-llvm-major-version: 22 +//@ [nvptx-post-llvm-23] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx-post-llvm-23] needs-llvm-components: nvptx +//@ [nvptx-post-llvm-23] min-llvm-version: 23 +//@ add-minicore +#![feature(intrinsics, no_core, rustc_attrs)] +#![no_core] + +extern crate minicore; + +#[rustc_intrinsic] +#[rustc_nounwind] +fn gpu_launch_sized_workgroup_mem<T>() -> *mut T; + +// amdgpu-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4 +// amdgpu-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 +// amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } + +// nvptx-pre-llvm-23: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 +// nvptx-pre-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } + +// nvptx-post-llvm-23-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4 +// nvptx-post-llvm-23-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 +// nvptx-post-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } +#[unsafe(no_mangle)] +pub fn fun() -> (*mut i32, *mut f64) { + let small = gpu_launch_sized_workgroup_mem::<i32>(); + let big = gpu_launch_sized_workgroup_mem::<f64>(); // Increase alignment to 8 + (small, big) +}