diff --git a/compiler/rustc_abi/src/lib.rs b/compiler/rustc_abi/src/lib.rs index 450a93ee8481e..96e4b822b041d 100644 --- a/compiler/rustc_abi/src/lib.rs +++ b/compiler/rustc_abi/src/lib.rs @@ -1753,6 +1753,9 @@ pub struct AddressSpace(pub u32); impl AddressSpace { /// LLVM's `0` address space. pub const ZERO: Self = AddressSpace(0); + /// The address space for workgroup memory on nvptx and amdgpu. + /// See e.g. the `gpu_launch_sized_workgroup_mem` intrinsic for details. + pub const GPU_WORKGROUP: Self = AddressSpace(3); } /// How many scalable vectors are in a `BackendRepr::ScalableVector`? diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs index d7b8a304e9591..419d38f95e595 100644 --- a/compiler/rustc_codegen_llvm/src/declare.rs +++ b/compiler/rustc_codegen_llvm/src/declare.rs @@ -14,6 +14,7 @@ use std::borrow::Borrow; use itertools::Itertools; +use rustc_abi::AddressSpace; use rustc_codegen_ssa::traits::{MiscCodegenMethods, TypeMembershipCodegenMethods}; use rustc_data_structures::fx::FxIndexSet; use rustc_middle::ty::{Instance, Ty}; @@ -104,6 +105,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> { ) } } + + /// Declare a global value in a specific address space. + /// + /// If there’s a value with the same name already declared, the function will + /// return its Value instead. 
+ pub(crate) fn declare_global_in_addrspace( + &self, + name: &str, + ty: &'ll Type, + addr_space: AddressSpace, + ) -> &'ll Value { + debug!("declare_global_in_addrspace(name={name:?}, addrspace={addr_space:?})"); + unsafe { + llvm::LLVMRustGetOrInsertGlobalInAddrspace( + (**self).borrow().llmod, + name.as_c_char_ptr(), + name.len(), + ty, + addr_space.0, + ) + } + } } impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index b8ba6db9aa807..a298cf963dccf 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -3,8 +3,8 @@ use std::ffi::c_uint; use std::{assert_matches, iter, ptr}; use rustc_abi::{ - Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive, Size, - WrappingRange, + AddressSpace, Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive, + Size, WrappingRange, }; use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh}; use rustc_codegen_ssa::common::{IntPredicate, TypeKind}; @@ -176,6 +176,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { span: Span, ) -> Result<(), ty::Instance<'tcx>> { let tcx = self.tcx; + let llvm_version = crate::llvm_util::get_version(); let name = tcx.item_name(instance.def_id()); let fn_args = instance.args; @@ -192,7 +193,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { | sym::maximum_number_nsz_f64 | sym::maximum_number_nsz_f128 // Need at least LLVM 22 for `min/maximumnum` to not crash LLVM. - if crate::llvm_util::get_version() >= (22, 0, 0) => + if llvm_version >= (22, 0, 0) => { let intrinsic_name = if name.as_str().starts_with("min") { "llvm.minimumnum" @@ -418,7 +419,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { } // FIXME move into the branch below when LLVM 22 is the lowest version we support. 
- sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => { + sym::carryless_mul if llvm_version >= (22, 0, 0) => { let ty = args[0].layout.ty; if !ty.is_integral() { tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType { @@ -618,6 +619,46 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } + sym::gpu_launch_sized_workgroup_mem => { + // Generate an anonymous global per call, with these properties: + // 1. The global is in the address space for workgroup memory + // 2. It is an `external` global + // 3. It is correctly aligned for the pointee `T` + // All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend. + // The name is irrelevant. + // See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared + let name = if llvm_version < (23, 0, 0) && tcx.sess.target.arch == Arch::Nvptx64 { + // The auto-assigned name for extern shared globals in the nvptx backend does + // not compile in ptxas. Workaround this issue by assigning a name. + // Fixed in LLVM 23. + "gpu_launch_sized_workgroup_mem" + } else { + "" + }; + let global = self.declare_global_in_addrspace( + name, + self.type_array(self.type_i8(), 0), + AddressSpace::GPU_WORKGROUP, + ); + let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() }; + // The alignment of the global is used to specify the *minimum* alignment that + // must be obeyed by the GPU runtime. + // When multiple of these global variables are used by a kernel, the maximum alignment is taken. 
+ // See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821 + let alignment = self.align_of(*inner_ty).bytes() as u32; + unsafe { + // FIXME Workaround the above issue by taking maximum alignment if the global existed + if tcx.sess.target.arch == Arch::Nvptx64 { + if alignment > llvm::LLVMGetAlignment(global) { + llvm::LLVMSetAlignment(global, alignment); + } + } else { + llvm::LLVMSetAlignment(global, alignment); + } + } + self.cx().const_pointercast(global, self.type_ptr()) + } + sym::amdgpu_dispatch_ptr => { let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]); // Relying on `LLVMBuildPointerCast` to produce an addrspacecast diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 525d1dbe9d0d3..3e373c42eca34 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -2003,6 +2003,13 @@ unsafe extern "C" { NameLen: size_t, T: &'a Type, ) -> &'a Value; + pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>( + M: &'a Module, + Name: *const c_char, + NameLen: size_t, + T: &'a Type, + AddressSpace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMRustGetNamedValue( M: &Module, Name: *const c_char, diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index fd0c7c656ac21..f4a5e8baa2a5f 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { sym::abort | sym::unreachable | sym::cold_path + | sym::gpu_launch_sized_workgroup_mem | sym::breakpoint | sym::amdgpu_dispatch_ptr | sym::assert_zero_valid diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index d952faa5edb74..9cc66f3c2adf6 100644 --- 
a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -130,6 +130,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::forget | sym::frem_algebraic | sym::fsub_algebraic + | sym::gpu_launch_sized_workgroup_mem | sym::is_val_statically_known | sym::log2f16 | sym::log2f32 @@ -297,6 +298,7 @@ pub(crate) fn check_intrinsic_type( sym::field_offset => (1, 0, vec![], tcx.types.usize), sym::rustc_peek => (1, 0, vec![param(0)], param(0)), sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()), + sym::gpu_launch_sized_workgroup_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))), sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => { (1, 0, vec![], tcx.types.unit) } diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp index c310e580af559..91bb1c9733630 100644 --- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp +++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp @@ -299,10 +299,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M, .getCallee()); } -extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, - const char *Name, - size_t NameLen, - LLVMTypeRef Ty) { +// Get the global variable with the given name if it exists or create a new +// external global. 
+extern "C" LLVMValueRef +LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name, + size_t NameLen, LLVMTypeRef Ty, + unsigned int AddressSpace) { Module *Mod = unwrap(M); auto NameRef = StringRef(Name, NameLen); @@ -313,10 +315,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true); if (!GV) GV = new GlobalVariable(*Mod, unwrap(Ty), false, - GlobalValue::ExternalLinkage, nullptr, NameRef); + GlobalValue::ExternalLinkage, nullptr, NameRef, + nullptr, GlobalValue::NotThreadLocal, AddressSpace); return wrap(GV); } +// Get the global variable with the given name if it exists or create a new +// external global. +extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, + const char *Name, + size_t NameLen, + LLVMTypeRef Ty) { + Module *Mod = unwrap(M); + unsigned int AddressSpace = + Mod->getDataLayout().getDefaultGlobalsAddressSpace(); + return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, + AddressSpace); +} + // Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`. enum class LLVMRustAttributeKind { AlwaysInline = 0, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 981bfed363dcc..5e7834e800ff7 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1032,6 +1032,7 @@ symbols! { global_asm, global_registration, globs, + gpu_launch_sized_workgroup_mem, gt, guard, guard_patterns, diff --git a/library/core/src/intrinsics/gpu.rs b/library/core/src/intrinsics/gpu.rs index 9e7624841d0c6..43cb7251c3c88 100644 --- a/library/core/src/intrinsics/gpu.rs +++ b/library/core/src/intrinsics/gpu.rs @@ -5,6 +5,51 @@ #![unstable(feature = "gpu_intrinsics", issue = "none")] +/// Returns the pointer to workgroup memory allocated at launch-time on GPUs. +/// +/// Workgroup memory is a memory region that is shared between all threads in +/// the same workgroup. 
It is faster to access than other memory but pointers do not +/// work outside the workgroup where they were obtained. +/// Workgroup memory can be allocated statically or after compilation, when +/// launching a gpu-kernel. `gpu_launch_sized_workgroup_mem` returns the pointer to +/// the memory that is allocated at launch-time. +/// The size of this memory can differ between launches of a gpu-kernel, depending on +/// what is specified at launch-time. +/// However, the alignment is fixed by the kernel itself, at compile-time. +/// +/// The returned pointer is the start of the workgroup memory region that is +/// allocated at launch-time. +/// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, independent of the +/// generic type, return the same address, so alias the same memory. +/// The returned pointer is aligned by at least the alignment of `T`. +/// +/// If `gpu_launch_sized_workgroup_mem` is invoked multiple times with different +/// types that have different alignment, then you may only rely on the resulting +/// pointer having the alignment of `T` after a call to `gpu_launch_sized_workgroup_mem::<T>` +/// has occurred in the current program execution. +/// +/// # Safety +/// +/// The pointer is safe to dereference from the start (the returned pointer) up to the +/// size of workgroup memory that was specified when launching the current gpu-kernel. +/// This allocated size is not related in any way to `T`. +/// +/// The user must take care of synchronizing access to workgroup memory between +/// threads in a workgroup. The usual data race requirements apply. +/// +/// # Other APIs +/// +/// CUDA and HIP call this dynamic shared memory, shared between threads in a block. +/// OpenCL and SYCL call this local memory, shared between threads in a work-group. +/// GLSL calls this shared memory, shared between invocations in a work group. +/// DirectX calls this groupshared memory, shared between threads in a thread-group. 
+#[must_use = "returns a pointer that does nothing unless used"] +#[rustc_intrinsic] +#[rustc_nounwind] +#[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")] +#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))] +pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T; /// Returns a pointer to the HSA kernel dispatch packet. /// /// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet. diff --git a/src/tools/tidy/src/style.rs b/src/tools/tidy/src/style.rs index d144ffa222097..4e2f71b94ce2d 100644 --- a/src/tools/tidy/src/style.rs +++ b/src/tools/tidy/src/style.rs @@ -222,6 +222,10 @@ fn should_ignore(line: &str) -> bool { || static_regex!( "\\s*//@ \\!?(count|files|has|has-dir|hasraw|matches|matchesraw|snapshot)\\s.*" ).is_match(line) + // Matching for FileCheck checks + || static_regex!( + "\\s*// [a-zA-Z0-9-_]*:\\s.*" + ).is_match(line) } /// Returns `true` if `line` is allowed to be longer than the normal limit. diff --git a/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs b/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs new file mode 100644 index 0000000000000..4764160fd0b59 --- /dev/null +++ b/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs @@ -0,0 +1,41 @@ +// Checks that the GPU intrinsic to get launch-sized workgroup memory works +// and correctly aligns the `external addrspace(...) global`s over multiple calls. 
+ +//@ revisions: amdgpu nvptx-pre-llvm-23 nvptx-post-llvm-23 +//@ compile-flags: --crate-type=rlib -Copt-level=1 +// +//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 +//@ [amdgpu] needs-llvm-components: amdgpu + +//@ [nvptx-pre-llvm-23] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx-pre-llvm-23] needs-llvm-components: nvptx +//@ [nvptx-pre-llvm-23] max-llvm-major-version: 22 +//@ [nvptx-post-llvm-23] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx-post-llvm-23] needs-llvm-components: nvptx +//@ [nvptx-post-llvm-23] min-llvm-version: 23 +//@ add-minicore +#![feature(intrinsics, no_core, rustc_attrs)] +#![no_core] + +extern crate minicore; + +#[rustc_intrinsic] +#[rustc_nounwind] +fn gpu_launch_sized_workgroup_mem<T>() -> *mut T; + +// amdgpu-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4 +// amdgpu-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 +// amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } + +// nvptx-pre-llvm-23: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 +// nvptx-pre-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } + +// nvptx-post-llvm-23-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4 +// nvptx-post-llvm-23-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 +// nvptx-post-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } +#[unsafe(no_mangle)] +pub fn fun() -> (*mut i32, *mut f64) { + let small = gpu_launch_sized_workgroup_mem::<i32>(); + let big = gpu_launch_sized_workgroup_mem::<f64>(); // Increase alignment to 8 + (small, big) +}