From 7905aef185ff75ee244654fcaa58b8fb613ee2c1 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 17 Jun 2026 12:24:04 -0700
Subject: [PATCH 1/4] [Bug] AMDGPU: don't emit llvm.amdgcn.permlane64 on CDNA

v_permlane64_b32 is an RDNA-only instruction: it exists on gfx11 (RDNA3) and
gfx12 (RDNA4) but on no CDNA part. Quadrants previously enabled the
llvm.amdgcn.permlane64 intrinsic on gfx940/gfx941/gfx942 (CDNA3) as well. On
gfx942 the AMDGPU backend does not fail cleanly with a "Cannot select" error
for the intrinsic -- it selects the V_PERMLANE64_B32 pseudo, which has no
valid MC opcode for CDNA, and then crashes with a bare SIGSEGV in
SIInstrInfo::getInstSizeInBytes during the branch-relaxation pass. This made
scene.build() segfault for any kernel using a cross-half subgroup shuffle
(genesis-world #2962).

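Gate has_permlane64 to gfx11/gfx12 only, so every CDNA target (and gfx10.x
RDNA1/2) takes the existing LDS-roundtrip software emulation of the
lane i <-> i ^ 32 swap instead of the native intrinsic. The emulation
produces correct cross-half results on RDNA wave64 (gfx10.x). On CDNA this
change removes the crash, but the shuffle helper's ds_bpermute lane masking
is still RDNA-shaped there, so cross-half results on CDNA only become
correct with the follow-up patch (2/4). Also drop the
QD_AMDGPU_FORCE_PERMLANE64_FALLBACK env-var escape hatch and correct the
related comments in llvm_context.cpp, runtime.cpp and test_simt.py.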
---
 quadrants/runtime/llvm/llvm_context.cpp       | 26 +++++++++----------
 .../runtime/llvm/runtime_module/runtime.cpp   | 22 ++++++++--------
 tests/python/test_simt.py                     |  7 +++--
 3 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp
index 76ae4b5a0e..c50e2fbd86 100644
--- a/quadrants/runtime/llvm/llvm_context.cpp
+++ b/quadrants/runtime/llvm/llvm_context.cpp
@@ -556,9 +556,14 @@ std::unique_ptr<llvm::Module> QuadrantsLLVMContext::module_from_file(const std::
       // We use it to extend the SIMD32-scoped ``ds_bpermute`` (every shuffle op lowers to that) into a wave64-aware
       // cross-half shuffle on RDNA: ``ds_bpermute`` reads within the lane's own 32-lane SIMD cluster, ``permlane64``
       // brings the other SIMD's value to this lane, and we select between the two based on which half the target lane
-      // sits in. See ``amdgpu_cross_half_shuffle_i32`` in runtime.cpp. The instruction is gfx940+ (CDNA3) and gfx11+
-      // (RDNA3+) only -- on earlier wave64-capable targets (gfx9xx CDNA1/2, gfx10.x RDNA1/2) the AMDGPU LLVM backend
-      // hits "Cannot select" while lowering the intrinsic, so we have to provide a software emulation.
+      // sits in. See ``amdgpu_cross_half_shuffle_i32`` in runtime.cpp. ``v_permlane64_b32`` is an RDNA-only
+      // instruction: it exists on gfx11 (RDNA3) and gfx12 (RDNA4), but on NO CDNA part -- the AMD assembler rejects it
+      // ("instruction not supported on this GPU") for gfx908/gfx90a (CDNA1/2), gfx940/gfx941/gfx942 (CDNA3) and gfx950
+      // (CDNA4) alike. On every wave64-capable target without the native instruction (all CDNA parts, plus gfx10.x
+      // RDNA1/2) we must provide a software emulation. CDNA3 (gfx942) is especially treacherous: the backend does NOT
+      // cleanly "Cannot select" the intrinsic -- it selects the ``V_PERMLANE64_B32`` pseudo, which has no valid MC
+      // opcode for CDNA, and then crashes with a bare SIGSEGV inside ``SIInstrInfo::getInstSizeInBytes`` during the
+      // branch-relaxation pass. So we must never emit the intrinsic on CDNA.
       //
       // The emulation is a wave-local LDS roundtrip: each lane writes its ``value`` to ``lds[wave_base + lane]``,
       // a wavefront-scope acquire-release fence lowers to ``s_waitcnt lgkmcnt(0)`` (drains outstanding LDS writes),
@@ -574,17 +579,10 @@ std::unique_ptr<llvm::Module> QuadrantsLLVMContext::module_from_file(const std::
       // have to pass the explicit ``i32`` type alongside the ID -- otherwise ``CreateIntrinsic`` segfaults inside
       // ``getDeclaration()`` while resolving the mangled name.
       auto mcpu_str = AMDGPUContext::get_instance().get_mcpu();
-      bool has_permlane64 = (mcpu_str == "gfx940" || mcpu_str == "gfx941" || mcpu_str == "gfx942" ||
-                             mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12");
-      // Escape hatch for validating the LDS software emulation on hardware that natively supports
-      // ``v_permlane64_b32``: setting ``QD_AMDGPU_FORCE_PERMLANE64_FALLBACK=1`` forces the JIT to take the LDS path
-      // even on gfx11+ / gfx940+, so we can exercise the fallback on a working AMD box (gfx1100 / gfx942) without
-      // needing a gfx10.x runner.  Has no effect on non-AMDGPU backends.
-      if (const char *force_fallback = std::getenv("QD_AMDGPU_FORCE_PERMLANE64_FALLBACK")) {
-        if (force_fallback[0] == '1') {
-          has_permlane64 = false;
-        }
-      }
+      // RDNA3+ (gfx11) and RDNA4 (gfx12) only. No CDNA part has ``v_permlane64_b32`` (see above), so every gfx9xx
+      // target takes the LDS emulation -- including gfx940/gfx941/gfx942 (CDNA3), which used to be (wrongly) listed
+      // here and made the AMDGPU backend segfault on any kernel using a cross-half subgroup shuffle.
+      bool has_permlane64 = (mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12");
       if (has_permlane64) {
         patch_intrinsic("amdgpu_permlane64", llvm::Intrinsic::amdgcn_permlane64, true, {llvm::Type::getInt32Ty(*ctx)});
       } else if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) {
diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp
index facebb5f43..a4d4f4d39b 100644
--- a/quadrants/runtime/llvm/runtime_module/runtime.cpp
+++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp
@@ -1015,13 +1015,13 @@ i32 amdgpu_ds_bpermute(i32 byte_index, i32 value) {
 }
 
 // Exchanges a 32-bit value between lanes ``i`` and ``i ^ 32`` in a single instruction. The native instruction
-// ``v_permlane64_b32`` is only available on gfx940+ (CDNA3) and gfx11+ (RDNA3+); ``llvm_context.cpp`` detects the
-// target at JIT time and patches this stub to either the ``llvm.amdgcn.permlane64`` intrinsic (on supported
-// hardware) or an LDS-roundtrip software emulation (on gfx9xx CDNA1/2 and gfx10.x RDNA1/2). The emulation has higher
-// latency (LDS store + ``s_waitcnt`` + LDS load -- roughly tens of cycles per call vs. a few for the native swap),
-// but produces correct cross-half results on RDNA wave64 emulation hardware. Used by
-// ``amdgpu_cross_half_shuffle_i32`` below to repair the cross-half story for ``ds_bpermute``, which is SIMD32-scoped
-// on RDNA.
+// ``v_permlane64_b32`` is RDNA-only -- it exists on gfx11 (RDNA3) and gfx12 (RDNA4), but on NO CDNA part (the AMD
+// assembler rejects it on gfx908/gfx90a/gfx940/gfx941/gfx942/gfx950). ``llvm_context.cpp`` detects the target at JIT
+// time and patches this stub to either the ``llvm.amdgcn.permlane64`` intrinsic (on gfx11+/gfx12 only) or an
+// LDS-roundtrip software emulation (on every CDNA part and on gfx10.x RDNA1/2). The emulation has higher latency (LDS
+// store + ``s_waitcnt`` + LDS load -- roughly tens of cycles per call vs. a few for the native swap), but produces
+// correct cross-half results on wave64 hardware. Used by ``amdgpu_cross_half_shuffle_i32`` below to repair the
+// cross-half story for ``ds_bpermute``, which is SIMD32-scoped on RDNA.
 i32 amdgpu_permlane64(i32 value) {
   __builtin_trap();
   return 0;
@@ -1078,10 +1078,10 @@ i32 amdgpu_lane_id() {
 // Note this is correct on every AMDGPU target we run on. On CDNA (gfx9xx, gfx940/942) ``ds_bpermute`` could in
 // principle directly address all 64 lanes, but because we always mask the byte argument to ``(target_lane & 31) * 4``
 // we never test that path -- on both ISAs the byte index is in [0, 128) and only addresses the bottom half. The
-// ``permlane64`` swap then supplies the top-half data: on hardware with the native instruction (gfx940+ CDNA3 /
-// gfx11+ RDNA3+) this is a single ``v_permlane64_b32``; on older wave64-capable targets (gfx9xx CDNA1/2, gfx10.x
-// RDNA1/2) the JIT patches ``amdgpu_permlane64`` to an LDS roundtrip that produces the same result at higher latency
-// (see the patching logic in ``llvm_context.cpp``).
+// ``permlane64`` swap then supplies the top-half data: on hardware with the native instruction (gfx11 RDNA3 / gfx12
+// RDNA4) this is a single ``v_permlane64_b32``; on every other wave64-capable target -- all CDNA parts (gfx9xx,
+// including gfx940/gfx942 CDNA3 and gfx950 CDNA4) and gfx10.x RDNA1/2 -- the JIT patches ``amdgpu_permlane64`` to an
+// LDS roundtrip that produces the same result at higher latency (see the patching logic in ``llvm_context.cpp``).
 //
 // OOR target lanes (``target_lane < 0`` or ``target_lane >= 64``): we mask to ``target_lane & 31`` for the byte and
 // ``& 32`` for the half-bit. The behaviour for OOR targets is implementation-defined on every backend (CUDA's
diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index dd5f69c89c..deb6b72797 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -3845,8 +3845,11 @@ def test_subgroup_exclusive_add_tiled_log2_size_6():
 # to the ``permlane64``-based cross-half helper a lane in the bottom half could not read the top half (and vice
 # versa).  All five tests are gated to ``log2_group_size() == 6`` so they only assert anything on real wave64
 # hardware -- CUDA and SPIR-V backends with wave32 skip the absolute-correctness check (the cross-half partner is
-# out of range there, which is implementation-defined).  CDNA (gfx9xx, MI300X) already had a wave64-wide
-# ``ds_bpermute`` so its behaviour is unchanged by the fix; the new helper is observably a no-op on that path.
+# out of range there, which is implementation-defined).  Both CDNA and RDNA wave64 run these: the helper masks
+# ``ds_bpermute`` to the bottom 32 lanes and pulls the top half via ``amdgpu_permlane64``, which the JIT lowers to the
+# native ``v_permlane64_b32`` on RDNA3+/gfx11+ but to an LDS-roundtrip emulation on every CDNA part (gfx9xx, incl.
+# gfx942 MI300X) -- CDNA has no ``v_permlane64_b32`` and emitting the intrinsic there crashes the AMDGPU backend
+# (genesis-world issue #2962).  On CDNA these tests therefore exercise the emulation path automatically.
 # --------------------------------------------------------------------------------------------------------------------
 
 

From 33a7f16e76a172efad83bc9aa67de879a369541f Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 17 Jun 2026 12:40:23 -0700
Subject: [PATCH 2/4] [Bug] AMDGPU: make wave64 cross-half shuffle correct on
 CDNA

Gating permlane64 off CDNA (prev commit) stops the SIGSEGV but exposed a
latent correctness bug: the cross-half shuffle helper is RDNA-shaped. It masks
ds_bpermute's target to 31 and relies on permlane64 to fetch the top half,
which is correct only where ds_bpermute is SIMD32-scoped (RDNA). On CDNA
ds_bpermute is wave64-wide, so masking to 31 means every lane reads a
bottom-half lane and the top half is never reached -- shuffle_xor(v,32)
returned [32..63, 32..63] instead of [32..63, 0..31] on gfx942 (CI never
caught this: RDNA uses the native instruction, not this path).

Make the lowering architecture-aware via two JIT-patched knobs:
- amdgpu_ds_bpermute_lane_mask(): 63 on GCN/CDNA (gfx9xx, wave64-wide
  ds_bpermute), 31 on RDNA (SIMD32-scoped). With mask 63 a single wide
  ds_bpermute already returns lane target_lane for the whole wave.
- permlane64 patched to the identity on CDNA, so the helper's cross-SIMD
  branch equals the same-SIMD branch and the per-lane select is a true no-op
  (and the intrinsic, which has no MC opcode on CDNA, is never emitted).

RDNA paths are unchanged: gfx11/gfx12 keep native v_permlane64_b32, gfx10.x
keeps the LDS-roundtrip emulation, both with lane mask 31.
---
 quadrants/runtime/llvm/llvm_context.cpp       | 84 ++++++++++++-------
 .../runtime/llvm/runtime_module/runtime.cpp   | 34 +++++---
 tests/python/test_simt.py                     | 11 +--
 3 files changed, 83 insertions(+), 46 deletions(-)

diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp
index c50e2fbd86..372375fd16 100644
--- a/quadrants/runtime/llvm/llvm_context.cpp
+++ b/quadrants/runtime/llvm/llvm_context.cpp
@@ -552,41 +552,65 @@ std::unique_ptr<llvm::Module> QuadrantsLLVMContext::module_from_file(const std::
       }
       patch_intrinsic("amdgpu_clock_i64", llvm::Intrinsic::amdgcn_s_memtime);
       patch_intrinsic("amdgpu_ds_bpermute", llvm::Intrinsic::amdgcn_ds_bpermute);
-      // ``llvm.amdgcn.permlane64`` exchanges a 32-bit value between lanes ``i`` and ``i ^ 32`` in a single instruction.
-      // We use it to extend the SIMD32-scoped ``ds_bpermute`` (every shuffle op lowers to that) into a wave64-aware
-      // cross-half shuffle on RDNA: ``ds_bpermute`` reads within the lane's own 32-lane SIMD cluster, ``permlane64``
-      // brings the other SIMD's value to this lane, and we select between the two based on which half the target lane
-      // sits in. See ``amdgpu_cross_half_shuffle_i32`` in runtime.cpp. ``v_permlane64_b32`` is an RDNA-only
-      // instruction: it exists on gfx11 (RDNA3) and gfx12 (RDNA4), but on NO CDNA part -- the AMD assembler rejects it
-      // ("instruction not supported on this GPU") for gfx908/gfx90a (CDNA1/2), gfx940/gfx941/gfx942 (CDNA3) and gfx950
-      // (CDNA4) alike. On every wave64-capable target without the native instruction (all CDNA parts, plus gfx10.x
-      // RDNA1/2) we must provide a software emulation. CDNA3 (gfx942) is especially treacherous: the backend does NOT
-      // cleanly "Cannot select" the intrinsic -- it selects the ``V_PERMLANE64_B32`` pseudo, which has no valid MC
-      // opcode for CDNA, and then crashes with a bare SIGSEGV inside ``SIInstrInfo::getInstSizeInBytes`` during the
-      // branch-relaxation pass. So we must never emit the intrinsic on CDNA.
+      // The wave64 cross-half subgroup shuffle (``amdgpu_cross_half_shuffle_i32`` in runtime.cpp) is built from
+      // ``ds_bpermute`` plus, on some targets, ``permlane64``. How those behave -- and therefore how we patch them --
+      // depends on the architecture family:
       //
-      // The emulation is a wave-local LDS roundtrip: each lane writes its ``value`` to ``lds[wave_base + lane]``,
-      // a wavefront-scope acquire-release fence lowers to ``s_waitcnt lgkmcnt(0)`` (drains outstanding LDS writes),
-      // and each lane then reads ``lds[wave_base + (lane ^ 32)]``. On RDNA wave64-emulation the two SIMD32 halves of
-      // the wave issue store / load in two passes apiece, but the waitcnt between them guarantees both halves' stores
-      // are committed to LDS before either half's loads issue, so the cross-half routing is correct.  ``wave_base``
-      // is ``(workitem.id.x >> 6) << 6``, scoping the LDS slot to a single wave so multi-wave workgroups don't
-      // collide.  The LDS buffer is a 1024-entry per-workgroup global (4 KiB) -- enough for the AMDGPU 1024-thread
-      // workgroup max at wave64.  The buffer is only materialised on this code path, so kernels on permlane64-capable
-      // hardware (the common case) pay zero LDS for cross-half shuffles.
+      //   * GCN / CDNA (gfx9xx: gfx900/906 Vega, gfx908/gfx90a CDNA1/2, gfx940/gfx941/gfx942 CDNA3, gfx950 CDNA4):
+      //     ``ds_bpermute`` addresses the full wave64 directly, so the cross-half shuffle is a single wide
+      //     ``ds_bpermute`` (lane mask 63) and ``permlane64`` is unnecessary. Critically, ``v_permlane64_b32`` does
+      //     not exist on CDNA -- emitting ``llvm.amdgcn.permlane64`` makes the backend select the ``V_PERMLANE64_B32``
+      //     pseudo, which has no valid MC opcode for CDNA, and then crash with a bare SIGSEGV inside
+      //     ``SIInstrInfo::getInstSizeInBytes`` during branch relaxation (genesis-world #2962). So on CDNA we patch
+      //     ``amdgpu_permlane64`` to the identity, which neutralises the helper's (RDNA-shaped) cross-SIMD branch
+      //     without ever emitting the intrinsic.
+      //   * RDNA3/4 (gfx11 / gfx12): ``ds_bpermute`` is SIMD32-scoped (lane mask 31), so the top half is reached via
+      //     the native single-instruction ``v_permlane64_b32``.
+      //   * RDNA1/2 (gfx10.x): ``ds_bpermute`` is SIMD32-scoped (lane mask 31), but ``v_permlane64_b32`` does not
+      //     exist yet, so we emulate the lane ``i`` <-> ``i ^ 32`` swap with an LDS roundtrip (below).
       //
-      // The intrinsic is overloaded on its element type (signature ``T -> T`` for any 32-bit-or-smaller ``T``), so we
-      // have to pass the explicit ``i32`` type alongside the ID -- otherwise ``CreateIntrinsic`` segfaults inside
-      // ``getDeclaration()`` while resolving the mangled name.
+      // The LDS emulation writes each lane's ``value`` to ``lds[wave_base + lane]``, issues a wavefront-scope
+      // acquire-release fence (lowers to ``s_waitcnt lgkmcnt(0)``: drains outstanding LDS writes without the
+      // cross-wave ``s_barrier`` a workgroup-scope fence would emit, which would deadlock if only some waves reach
+      // this point), then reads back ``lds[wave_base + (lane ^ 32)]``. ``wave_base`` is ``(workitem.id.x >> 6) << 6``,
+      // scoping the slot to a single wave so multi-wave workgroups don't collide. The buffer is a 1024-entry
+      // per-workgroup global (4 KiB, the AMDGPU 1024-thread wave64 max), materialised only on this path, so kernels
+      // on the other two paths pay zero LDS for cross-half shuffles.
+      //
+      // ``patch_intrinsic`` for permlane64 passes the explicit ``i32`` type alongside the ID because the intrinsic is
+      // overloaded on its element type (signature ``T -> T`` for any 32-bit-or-smaller ``T``); otherwise
+      // ``CreateIntrinsic`` segfaults inside ``getDeclaration()`` while resolving the mangled name.
       auto mcpu_str = AMDGPUContext::get_instance().get_mcpu();
-      // RDNA3+ (gfx11) and RDNA4 (gfx12) only. No CDNA part has ``v_permlane64_b32`` (see above), so every gfx9xx
-      // target takes the LDS emulation -- including gfx940/gfx941/gfx942 (CDNA3), which used to be (wrongly) listed
-      // here and made the AMDGPU backend segfault on any kernel using a cross-half subgroup shuffle.
-      bool has_permlane64 = (mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12");
-      if (has_permlane64) {
+      bool is_gcn_cdna = (mcpu_str.substr(0, 4) == "gfx9");
+      bool has_native_permlane64 = (mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12");
+
+      // Patch the ds_bpermute lane mask used by ``amdgpu_cross_half_shuffle_i32``: 63 where ``ds_bpermute`` is
+      // wave64-wide (GCN/CDNA), 31 where it is SIMD32-scoped (RDNA, paired with the permlane64 swap above).
+      if (auto mask_func = module->getFunction("amdgpu_ds_bpermute_lane_mask")) {
+        mask_func->deleteBody();
+        auto bb = llvm::BasicBlock::Create(*ctx, "entry", mask_func);
+        IRBuilder<> builder(*ctx);
+        builder.SetInsertPoint(bb);
+        builder.CreateRet(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), is_gcn_cdna ? 63 : 31));
+        QuadrantsLLVMContext::mark_inline(mask_func);
+      }
+
+      if (has_native_permlane64) {
         patch_intrinsic("amdgpu_permlane64", llvm::Intrinsic::amdgcn_permlane64, true, {llvm::Type::getInt32Ty(*ctx)});
+      } else if (is_gcn_cdna) {
+        // CDNA: the wide ds_bpermute already reaches all 64 lanes, so permlane64 is unnecessary -- and emitting it
+        // crashes the backend. Patch it to the identity so the helper's cross-SIMD branch returns the same value as
+        // its same-SIMD branch, making the per-lane select a true no-op.
+        if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) {
+          permlane64_func->deleteBody();
+          auto bb = llvm::BasicBlock::Create(*ctx, "entry", permlane64_func);
+          IRBuilder<> builder(*ctx);
+          builder.SetInsertPoint(bb);
+          builder.CreateRet(&*permlane64_func->arg_begin());
+          QuadrantsLLVMContext::mark_inline(permlane64_func);
+        }
       } else if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) {
-        // LDS-based software emulation.  Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + lane``.
+        // gfx10.x RDNA1/2: LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + lane``.
         auto i32_ty = llvm::Type::getInt32Ty(*ctx);
         auto buf_ty = llvm::ArrayType::get(i32_ty, 1024);
         auto lds_global = llvm::cast_or_null<llvm::GlobalVariable>(module->getNamedValue("__amdgpu_permlane64_lds"));
diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp
index a4d4f4d39b..0edb72ec5d 100644
--- a/quadrants/runtime/llvm/runtime_module/runtime.cpp
+++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp
@@ -1059,6 +1059,15 @@ i32 amdgpu_lane_id() {
   return amdgpu_mbcnt_hi(-1, amdgpu_mbcnt_lo(-1, 0));
 }
 
+// Lane-index mask applied to ``ds_bpermute``'s target in ``amdgpu_cross_half_shuffle_i32``. ``ds_bpermute``'s reach
+// differs by ISA, so ``llvm_context.cpp`` patches this at JIT load time: 63 on GCN/CDNA (gfx9xx), where
+// ``ds_bpermute`` addresses the full wave64 directly, and 31 on RDNA (gfx10/11/12), where it is SIMD32-scoped and the
+// top half is reached via the ``permlane64`` swap instead. Defaults to 31 so an unpatched build stays correct on
+// RDNA. (runtime.cpp is compiled at -O0 to bitcode, so this stays a real call until the JIT patches + inlines it.)
+i32 amdgpu_ds_bpermute_lane_mask() {
+  return 31;
+}
+
 // Wave64-aware "read ``value`` from lane ``target_lane``" gather for AMDGPU. Shared by every i32 shuffle variant
 // (``shuffle`` / ``shuffle_down`` / ``shuffle_up``); the f32 / i64 / f64 wrappers below decompose into i32 calls and
 // therefore inherit the wave64 fix for free.
@@ -1075,13 +1084,16 @@ i32 amdgpu_lane_id() {
 // so ``ds_bpermute(byte, permlane64(value))`` effectively reads from lanes 32-63. We always compute both reads and
 // select between them branchlessly based on the high bit of ``target_lane``: bit 5 picks the half.
 //
-// Note this is correct on every AMDGPU target we run on. On CDNA (gfx9xx, gfx940/942) ``ds_bpermute`` could in
-// principle directly address all 64 lanes, but because we always mask the byte argument to ``(target_lane & 31) * 4``
-// we never test that path -- on both ISAs the byte index is in [0, 128) and only addresses the bottom half. The
-// ``permlane64`` swap then supplies the top-half data: on hardware with the native instruction (gfx11 RDNA3 / gfx12
-// RDNA4) this is a single ``v_permlane64_b32``; on every other wave64-capable target -- all CDNA parts (gfx9xx,
-// including gfx940/gfx942 CDNA3 and gfx950 CDNA4) and gfx10.x RDNA1/2 -- the JIT patches ``amdgpu_permlane64`` to an
-// LDS roundtrip that produces the same result at higher latency (see the patching logic in ``llvm_context.cpp``).
+// The lane-index mask (``amdgpu_ds_bpermute_lane_mask()``) and ``permlane64`` are both JIT-patched per architecture
+// (see ``llvm_context.cpp``), because ``ds_bpermute``'s reach differs by ISA:
+//   * GCN / CDNA (gfx9xx, incl. gfx940/gfx942/gfx950): ``ds_bpermute`` addresses the full wave64, so the mask is 63
+//     and a single ``ds_bpermute((target_lane & 63) * 4, value)`` already returns lane ``target_lane`` for the whole
+//     wave. ``permlane64`` is patched to the identity there -- ``v_permlane64_b32`` does not exist on CDNA and
+//     emitting it crashes the backend (genesis-world #2962) -- so ``from_other_half`` equals ``from_self_half`` and
+//     the select below is a true no-op.
+//   * RDNA (gfx10/11/12): ``ds_bpermute`` is SIMD32-scoped, so the mask is 31 and ``ds_bpermute`` only reaches the
+//     issuing lane's own 32-lane half. The top half is supplied by the ``permlane64`` swap -- a single
+//     ``v_permlane64_b32`` on gfx11/gfx12, or an LDS-roundtrip emulation on gfx10.x.
 //
 // OOR target lanes (``target_lane < 0`` or ``target_lane >= 64``): we mask to ``target_lane & 31`` for the byte and
 // ``& 32`` for the half-bit. The behaviour for OOR targets is implementation-defined on every backend (CUDA's
@@ -1093,12 +1105,12 @@ i32 amdgpu_cross_half_shuffle_i32(i32 target_lane, i32 value) {
   // wave -- lifting it above the select keeps the AMDGPU backend happy and lets it issue exactly one
   // ``v_permlane64_b32``. ``ds_bpermute`` on RDNA wave64 is SIMD32-scoped with a 5-bit address (top half of the wave
   // is unreachable directly), so ``from_self_half`` handles the same-SIMD case and ``from_other_half`` handles the
-  // cross-SIMD case via the ``swapped`` payload. On CDNA the wave is one SIMD64 so both reads return the same value
-  // and the select is a no-op; we don't try to optimize that out because the dead read is cheap (LLVM CSE may fold
-  // it anyway).
+  // cross-SIMD case via the ``swapped`` payload. On CDNA the lane mask is 63 (``ds_bpermute`` is wave64-wide) and
+  // ``permlane64`` is patched to the identity, so ``swapped == value``, both reads return lane ``target_lane``, and
+  // the select is a true no-op; we don't optimize that out because the dead read is cheap (LLVM CSE may fold it).
   i32 self_lane = amdgpu_lane_id();
   i32 swapped = amdgpu_permlane64(value);
-  i32 byte = (target_lane & 31) * 4;
+  i32 byte = (target_lane & amdgpu_ds_bpermute_lane_mask()) * 4;
   // ``llvm.amdgcn.ds.bpermute`` is the real hardware ``ds_bpermute_b32`` -- but if LLVM's uniformity analysis decides
   // ``byte`` is uniform across the wave (e.g. ``target_lane`` is a compile-time constant), it sometimes lowers to a
   // ``v_readlane_b32``-style instruction that addresses lanes 0..31 wave-globally rather than SIMD32-locally. On
diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index deb6b72797..b5f055010b 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -3845,11 +3845,12 @@ def test_subgroup_exclusive_add_tiled_log2_size_6():
 # to the ``permlane64``-based cross-half helper a lane in the bottom half could not read the top half (and vice
 # versa).  All five tests are gated to ``log2_group_size() == 6`` so they only assert anything on real wave64
 # hardware -- CUDA and SPIR-V backends with wave32 skip the absolute-correctness check (the cross-half partner is
-# out of range there, which is implementation-defined).  Both CDNA and RDNA wave64 run these: the helper masks
-# ``ds_bpermute`` to the bottom 32 lanes and pulls the top half via ``amdgpu_permlane64``, which the JIT lowers to the
-# native ``v_permlane64_b32`` on RDNA3+/gfx11+ but to an LDS-roundtrip emulation on every CDNA part (gfx9xx, incl.
-# gfx942 MI300X) -- CDNA has no ``v_permlane64_b32`` and emitting the intrinsic there crashes the AMDGPU backend
-# (genesis-world issue #2962).  On CDNA these tests therefore exercise the emulation path automatically.
+# out of range there, which is implementation-defined).  Both CDNA and RDNA wave64 run these, via different lowerings
+# of ``amdgpu_cross_half_shuffle_i32``: on CDNA (gfx9xx, incl. gfx942 MI300X) ``ds_bpermute`` is wave64-wide, so the
+# cross-half read is a single wide ``ds_bpermute`` (lane mask 63) and ``permlane64`` is unused; on RDNA ``ds_bpermute``
+# is SIMD32-scoped (lane mask 31) and the top half comes from ``amdgpu_permlane64`` -- the native ``v_permlane64_b32``
+# on gfx11+/gfx12, or an LDS-roundtrip emulation on gfx10.x.  ``v_permlane64_b32`` does not exist on CDNA and emitting
+# it there crashes the AMDGPU backend (genesis-world issue #2962), so the JIT must never emit it on gfx9xx.
 # --------------------------------------------------------------------------------------------------------------------
 
 

From 7ba5a45e928ccc89b7538586984b5da78a7b34a7 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 17 Jun 2026 13:23:05 -0700
Subject: [PATCH 3/4] [Docs] AMDGPU: correct wave64 cross-half shuffle lowering
 for CDNA

The subgroup docs described emitting v_permlane64_b32 on CDNA as
"well-defined and free", which is exactly the pre-fix behavior that
crashed the AMDGPU backend (genesis-world #2962). Document the actual
per-arch lowering: a single wave-wide ds_bpermute on CDNA (no permlane64)
and the permlane64 + ds_bpermute + select pairing on RDNA wave64.
---
 docs/source/user_guide/subgroup.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/source/user_guide/subgroup.md b/docs/source/user_guide/subgroup.md
index ba99df5a92..ce21098538 100644
--- a/docs/source/user_guide/subgroup.md
+++ b/docs/source/user_guide/subgroup.md
@@ -110,14 +110,14 @@ Each lane returns the `value` held by the lane whose subgroup-local id equals `i
 Lane `i` returns the `value` held by lane `i + offset`. Lanes near the top of the subgroup - where `i + offset >= subgroup_size` - receive an implementation-defined value (typically their own `value`), so reduction patterns must only trust lane 0's final result, or mask out the out-of-range lanes.
 
 - `value` and `offset` dtypes: same as `shuffle` above; `offset` is a `u32`.
-- Maps to `__shfl_down_sync` on CUDA and `OpGroupNonUniformShuffleDown` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute`; wave64 cross-half offsets (any `offset >= 32` for low-half lanes, or any non-zero `offset` for high-half lanes that lands across the SIMD32 boundary) go through the same `permlane64 + ds_bpermute + select` lowering as `shuffle` - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). These operations are added on both RDNA and CDNA.
+- Maps to `__shfl_down_sync` on CUDA and `OpGroupNonUniformShuffleDown` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute`; wave64 cross-half offsets (any `offset >= 32` for low-half lanes, or any non-zero `offset` for high-half lanes that lands across the SIMD32 boundary) go through the same wave64 cross-half lowering as `shuffle` - a single wave-wide `ds_bpermute` on CDNA, or a `permlane64 + ds_bpermute + select` sequence on RDNA - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). These operations are added on both RDNA and CDNA.
 
 ### `shuffle_up(value, offset)`
 
 Lane `i` returns the `value` held by lane `i - offset`. Lanes near the bottom of the subgroup - where `i - offset < 0` - receive an implementation-defined value (typically their own `value`), so the bottom `offset` lanes' results should be ignored or masked.
 
 - Same dtype rules as `shuffle` / `shuffle_down`; `offset` is a `u32`.
-- Maps to `__shfl_up_sync` on CUDA and `OpGroupNonUniformShuffleUp` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute((lane - offset) * 4, value)`; wave64 cross-half cases go through the [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) (same `permlane64 + ds_bpermute + select` sequence as `shuffle` / `shuffle_down`). These operations are added on both RDNA and CDNA.
+- Maps to `__shfl_up_sync` on CUDA and `OpGroupNonUniformShuffleUp` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute((lane - offset) * 4, value)`; wave64 cross-half cases go through the [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) (same per-arch handling as `shuffle` / `shuffle_down`: a single wave-wide `ds_bpermute` on CDNA, a `permlane64 + ds_bpermute + select` sequence on RDNA). These operations are added on both RDNA and CDNA.
 
 ### `shuffle_xor(value, mask)`
 
@@ -132,7 +132,7 @@ Lane `i` returns the `value` held by lane `i ^ mask`. Convenient for butterfly p
 Every lane in the subgroup returns the `value` held by the lane whose subgroup-local id equals `index`. Expresses intent ("read lane `index`") more directly than `shuffle(value, index)` and on backends with a dedicated broadcast may map to a cheaper instruction.
 
 - Same dtype rules as `shuffle`.
-- Maps to `__shfl_sync` on CUDA, `ds_bpermute` (plus a `permlane64`-driven cross-half select on wave64) on AMDGPU, and `OpGroupNonUniformBroadcast` on SPIR-V. See [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) for the wave64 mechanics. These operations are added on both RDNA and CDNA.
+- Maps to `__shfl_sync` on CUDA, `ds_bpermute` on AMDGPU (a single wave-wide `ds_bpermute` on CDNA wave64; `ds_bpermute` plus a `permlane64`-driven cross-half select on RDNA wave64), and `OpGroupNonUniformBroadcast` on SPIR-V. See [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) for the wave64 mechanics. These operations are added on both RDNA and CDNA.
 - **Important: on SPIR-V, `index` must be dynamically uniform** - the same value on every lane in the subgroup. Passing a per-lane varying `index` is undefined behavior, because `OpGroupNonUniformBroadcast` requires its `Id` operand to be dynamically uniform across the subgroup. On CUDA / AMDGPU, `index` may vary per lane and the call is identical to `shuffle(value, index)`. If you need a varying source lane, use `shuffle` directly.
 
 ### `broadcast_first(value)`
@@ -565,9 +565,10 @@ After the call, lane `k` (within each group of 32) holds `a[group_start] + a[gro
 
 ### AMDGPU wave64 cross-half lowering
 
-AMDGPU `ds_bpermute_b32` - the LDS-routed permute that Quadrants uses to lower `shuffle`, `shuffle_down`, and `shuffle_up` - has a hardware quirk on RDNA (gfx10/11/12, e.g. RX 7900 XTX): its lane-id operand is **SIMD32-scoped**. On a wave64 RDNA wave the 64 lanes execute as two SIMD32 clusters; `ds_bpermute` on those chips can only address lanes inside the requesting lane's own SIMD32 half. CDNA (gfx9xx, MI200/MI300) keeps the wave on a single SIMD64, so `ds_bpermute` there is wave-wide and the quirk does not exist.
+AMDGPU `ds_bpermute_b32` - the LDS-routed permute that Quadrants uses to lower `shuffle`, `shuffle_down`, and `shuffle_up` - reaches a different set of lanes depending on the architecture, so Quadrants lowers cross-half wave64 shuffles in two different ways:
 
-To make wave64 `shuffle` / `shuffle_down` / `shuffle_up` behave consistently across RDNA and CDNA, Quadrants always lowers cross-half-capable shuffles through this 3-op sequence:
+- **CDNA (gfx9xx, MI200 / MI300)**: the wave runs as a single SIMD64, so `ds_bpermute_b32` is wave-wide and addresses all 64 lanes directly. A cross-half shuffle is therefore a single `ds_bpermute_b32 (target_lane * 4), value` with the lane id masked to 6 bits - no `permlane64` involved. Quadrants must **not** emit `v_permlane64_b32` here: the instruction does not exist on any CDNA part, and feeding `llvm.amdgcn.permlane64` to the backend on gfx9xx makes it select a pseudo with no valid CDNA machine opcode and then crash during branch relaxation (genesis-world issue #2962).
+- **RDNA (gfx10/11/12, e.g. RX 7900 XTX)**: the 64 lanes execute as two SIMD32 clusters and `ds_bpermute_b32`'s lane-id operand is **SIMD32-scoped** - it can only address lanes inside the requesting lane's own 32-lane half. To reach the other half, Quadrants pairs `ds_bpermute` with a half-swap through this 3-op sequence:
 
 ```
 swapped = v_permlane64_b32 value         # swap the two SIMD32 halves of the wave
@@ -576,14 +577,14 @@ hi      = ds_bpermute_b32 (lane*4), swapped
 result  = ((target_lane ^ self_lane) & 32) ? hi : lo
 ```
 
-The two `ds_bpermute_b32` reads run in parallel - one reads the original payload (correct when target is in the same SIMD32 half), the other reads the `permlane64`-swapped payload (correct when the target is in the other half) - and a per-lane select picks between them based on whether the target crosses the 32-lane boundary. On CDNA the cross-half branch is dead, but the cost is one extra `v_permlane64_b32` (still well-defined and free) and one `v_cndmask_b32` - no measurable hit. On RDNA wave64 this is the only correct lowering.
+The two `ds_bpermute_b32` reads run in parallel - one reads the original payload (correct when the target is in the same SIMD32 half), the other reads the `permlane64`-swapped payload (correct when the target is in the other half) - and a per-lane select picks between them based on whether the target crosses the 32-lane boundary. The half-swap is a single `v_permlane64_b32` on gfx11 / gfx12 (RDNA3 / RDNA4); on gfx10.x (RDNA1/2), which predates the instruction, it is emulated with a wave-local LDS round-trip. (The same per-lane select runs on CDNA too, but with `permlane64` reduced to a no-op there, the two reads collapse to one wave-wide `ds_bpermute` and the select becomes dead.)
 
-One subtlety worth knowing about (mostly for anyone reading the generated IR): the lane-id operand to `ds_bpermute` is wrapped in an empty `+v` inline-asm fence inside the runtime helper. Without that fence, LLVM's AMDGPU backend can decide a compile-time-constant or otherwise uniform lane-id is "uniform across the wave" and silently lower the call to a `v_readlane_b32`-style instruction that addresses lanes 0..31 **wave-globally** rather than SIMD32-locally. That would break cross-half shuffles whose target lane is a literal (`broadcast(v, 47)`, `shuffle(v, qd.u32(40))`, etc.). The fence costs zero - same instruction shape on every path - and pins the lowering to a real `ds_bpermute_b32` so the SIMD-local semantics our `permlane64` pairing relies on always hold.
+One subtlety worth knowing about (mostly for anyone reading the generated IR): the lane-id operand to `ds_bpermute` is wrapped in an empty `+v` inline-asm fence inside the runtime helper. Without that fence, LLVM's AMDGPU backend can decide a compile-time-constant or otherwise uniform lane-id is "uniform across the wave" and silently lower the call to a `v_readlane_b32`-style instruction that addresses lanes 0..31 **wave-globally** instead of following the real `ds_bpermute_b32` lane semantics. That would break cross-half shuffles whose target lane is a literal (`broadcast(v, 47)`, `shuffle(v, qd.u32(40))`, etc.) on both ISAs. The fence costs nothing - same instruction shape on every path - and pins the lowering to a real `ds_bpermute_b32`, so the lane addressing that the shuffle lowering relies on always holds.
 
 ## Performance notes
 
 - Shuffles are register-to-register on CUDA (`__shfl_sync`, `__shfl_down_sync`, `__shfl_up_sync`) and on SPIR-V where the GPU has hardware support - typically a handful of cycles, no memory traffic.
-- AMDGPU `shuffle`, `shuffle_down`, and `shuffle_up` all go through `ds_permute` / `ds_bpermute` (LDS-routed, roughly tens of cycles). On wave64 the lowering issues two parallel `ds_bpermute_b32` reads plus a `v_permlane64_b32` swap and a per-lane select to handle cross-half shuffles correctly on RDNA - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). The two `ds_bpermute` reads issue in parallel, so the latency is the same as a single read; the `permlane64` and `cndmask` add a few extra cycles.
+- AMDGPU `shuffle`, `shuffle_down`, and `shuffle_up` all go through `ds_permute` / `ds_bpermute` (LDS-routed, roughly tens of cycles). On CDNA wave64 a cross-half shuffle is a single wave-wide `ds_bpermute_b32`; on RDNA wave64 the lowering issues two parallel `ds_bpermute_b32` reads plus a `v_permlane64_b32` swap and a per-lane select to reach across the SIMD32 boundary - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). On RDNA the two `ds_bpermute` reads issue in parallel, so their latency is the same as a single read; the `permlane64` and `cndmask` add a few extra cycles.
 - `shuffle_xor` and `broadcast_first` are `@qd.func` wrappers over `shuffle` / `broadcast` and inline at compile time, so on every backend they cost exactly the same as the underlying op.
 - Both `ballot_first_n` and `ballot` lower to a single hardware instruction on every backend - one cycle on CUDA (`__ballot_sync`), one instruction on AMDGPU (a single `v_cmp_*_e64` populating the wavefront-width SETCC, then a low-half store for `ballot_first_n`), and `OpGroupNonUniformBallot` on SPIR-V (extract one or two components of the result `uvec4`). At `n == 32` `ballot_first_n` elides the predicate-masking step entirely; at `n < 32` it inserts one extra multiply on the predicate.
 - `reduce_add` and `reduce_all_add` both issue exactly `log2_group_size()` shuffles and `log2_group_size()` adds per call (5 on wave32, 6 on AMDGPU wave64). No barriers, no shared memory, no launch overhead (they inline). The same holds for the `_tiled` form at any `log2_size`.

From cda264cb2e23f59a45410bae382909cd52f09da8 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 17 Jun 2026 14:08:13 -0700
Subject: [PATCH 4/4] [Misc] AMDGPU: clang-format reflow of permlane64
 emulation comment

---
 quadrants/runtime/llvm/llvm_context.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp
index 372375fd16..31d1a151fd 100644
--- a/quadrants/runtime/llvm/llvm_context.cpp
+++ b/quadrants/runtime/llvm/llvm_context.cpp
@@ -610,7 +610,8 @@ std::unique_ptr<llvm::Module> QuadrantsLLVMContext::module_from_file(const std::
           QuadrantsLLVMContext::mark_inline(permlane64_func);
         }
       } else if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) {
-        // gfx10.x RDNA1/2: LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + lane``.
+        // gfx10.x RDNA1/2: LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base +
+        // lane``.
         auto i32_ty = llvm::Type::getInt32Ty(*ctx);
         auto buf_ty = llvm::ArrayType::get(i32_ty, 1024);
         auto lds_global = llvm::cast_or_null<llvm::GlobalVariable>(module->getNamedValue("__amdgpu_permlane64_lds"));