From 7905aef185ff75ee244654fcaa58b8fb613ee2c1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 17 Jun 2026 12:24:04 -0700 Subject: [PATCH 1/4] [Bug] AMDGPU: don't emit llvm.amdgcn.permlane64 on CDNA v_permlane64_b32 is an RDNA-only instruction: it exists on gfx11 (RDNA3) and gfx12 (RDNA4) but on no CDNA part. quadrants previously enabled the llvm.amdgcn.permlane64 intrinsic on gfx940/gfx941/gfx942 (CDNA3) as well. On gfx942 the AMDGPU backend does not cleanly "Cannot select" the intrinsic -- it selects the V_PERMLANE64_B32 pseudo, which has no valid MC opcode for CDNA, and then crashes with a bare SIGSEGV in SIInstrInfo::getInstSizeInBytes during the branch-relaxation pass. This made scene.build() segfault for any kernel using a cross-half subgroup shuffle (genesis-world #2962). Gate has_permlane64 to gfx11/gfx12 only, so every CDNA target (and gfx10.x RDNA1/2) takes the existing LDS-roundtrip software emulation, which produces correct cross-half results on wave64 hardware. Also drop the QD_AMDGPU_FORCE_PERMLANE64_FALLBACK env-var escape hatch and correct the related comments in llvm_context.cpp, runtime.cpp and test_simt.py. 
--- quadrants/runtime/llvm/llvm_context.cpp | 26 +++++++++---------- .../runtime/llvm/runtime_module/runtime.cpp | 22 ++++++++-------- tests/python/test_simt.py | 7 +++-- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp index 76ae4b5a0e..c50e2fbd86 100644 --- a/quadrants/runtime/llvm/llvm_context.cpp +++ b/quadrants/runtime/llvm/llvm_context.cpp @@ -556,9 +556,14 @@ std::unique_ptr QuadrantsLLVMContext::module_from_file(const std:: // We use it to extend the SIMD32-scoped ``ds_bpermute`` (every shuffle op lowers to that) into a wave64-aware // cross-half shuffle on RDNA: ``ds_bpermute`` reads within the lane's own 32-lane SIMD cluster, ``permlane64`` // brings the other SIMD's value to this lane, and we select between the two based on which half the target lane - // sits in. See ``amdgpu_cross_half_shuffle_i32`` in runtime.cpp. The instruction is gfx940+ (CDNA3) and gfx11+ - // (RDNA3+) only -- on earlier wave64-capable targets (gfx9xx CDNA1/2, gfx10.x RDNA1/2) the AMDGPU LLVM backend - // hits "Cannot select" while lowering the intrinsic, so we have to provide a software emulation. + // sits in. See ``amdgpu_cross_half_shuffle_i32`` in runtime.cpp. ``v_permlane64_b32`` is an RDNA-only + // instruction: it exists on gfx11 (RDNA3) and gfx12 (RDNA4), but on NO CDNA part -- the AMD assembler rejects it + // ("instruction not supported on this GPU") for gfx908/gfx90a (CDNA1/2), gfx940/gfx941/gfx942 (CDNA3) and gfx950 + // (CDNA4) alike. On every wave64-capable target without the native instruction (all CDNA parts, plus gfx10.x + // RDNA1/2) we must provide a software emulation. 
CDNA3 (gfx942) is especially treacherous: the backend does NOT + // cleanly "Cannot select" the intrinsic -- it selects the ``V_PERMLANE64_B32`` pseudo, which has no valid MC + // opcode for CDNA, and then crashes with a bare SIGSEGV inside ``SIInstrInfo::getInstSizeInBytes`` during the + // branch-relaxation pass. So we must never emit the intrinsic on CDNA. // // The emulation is a wave-local LDS roundtrip: each lane writes its ``value`` to ``lds[wave_base + lane]``, // a wavefront-scope acquire-release fence lowers to ``s_waitcnt lgkmcnt(0)`` (drains outstanding LDS writes), @@ -574,17 +579,10 @@ std::unique_ptr QuadrantsLLVMContext::module_from_file(const std:: // have to pass the explicit ``i32`` type alongside the ID -- otherwise ``CreateIntrinsic`` segfaults inside // ``getDeclaration()`` while resolving the mangled name. auto mcpu_str = AMDGPUContext::get_instance().get_mcpu(); - bool has_permlane64 = (mcpu_str == "gfx940" || mcpu_str == "gfx941" || mcpu_str == "gfx942" || - mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12"); - // Escape hatch for validating the LDS software emulation on hardware that natively supports - // ``v_permlane64_b32``: setting ``QD_AMDGPU_FORCE_PERMLANE64_FALLBACK=1`` forces the JIT to take the LDS path - // even on gfx11+ / gfx940+, so we can exercise the fallback on a working AMD box (gfx1100 / gfx942) without - // needing a gfx10.x runner. Has no effect on non-AMDGPU backends. - if (const char *force_fallback = std::getenv("QD_AMDGPU_FORCE_PERMLANE64_FALLBACK")) { - if (force_fallback[0] == '1') { - has_permlane64 = false; - } - } + // RDNA3+ (gfx11) and RDNA4 (gfx12) only. No CDNA part has ``v_permlane64_b32`` (see above), so every gfx9xx + // target takes the LDS emulation -- including gfx940/gfx941/gfx942 (CDNA3), which used to be (wrongly) listed + // here and made the AMDGPU backend segfault on any kernel using a cross-half subgroup shuffle. 
+ bool has_permlane64 = (mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12"); if (has_permlane64) { patch_intrinsic("amdgpu_permlane64", llvm::Intrinsic::amdgcn_permlane64, true, {llvm::Type::getInt32Ty(*ctx)}); } else if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) { diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index facebb5f43..a4d4f4d39b 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -1015,13 +1015,13 @@ i32 amdgpu_ds_bpermute(i32 byte_index, i32 value) { } // Exchanges a 32-bit value between lanes ``i`` and ``i ^ 32`` in a single instruction. The native instruction -// ``v_permlane64_b32`` is only available on gfx940+ (CDNA3) and gfx11+ (RDNA3+); ``llvm_context.cpp`` detects the -// target at JIT time and patches this stub to either the ``llvm.amdgcn.permlane64`` intrinsic (on supported -// hardware) or an LDS-roundtrip software emulation (on gfx9xx CDNA1/2 and gfx10.x RDNA1/2). The emulation has higher -// latency (LDS store + ``s_waitcnt`` + LDS load -- roughly tens of cycles per call vs. a few for the native swap), -// but produces correct cross-half results on RDNA wave64 emulation hardware. Used by -// ``amdgpu_cross_half_shuffle_i32`` below to repair the cross-half story for ``ds_bpermute``, which is SIMD32-scoped -// on RDNA. +// ``v_permlane64_b32`` is RDNA-only -- it exists on gfx11 (RDNA3) and gfx12 (RDNA4), but on NO CDNA part (the AMD +// assembler rejects it on gfx908/gfx90a/gfx940/gfx941/gfx942/gfx950). ``llvm_context.cpp`` detects the target at JIT +// time and patches this stub to either the ``llvm.amdgcn.permlane64`` intrinsic (on gfx11+/gfx12 only) or an +// LDS-roundtrip software emulation (on every CDNA part and on gfx10.x RDNA1/2). The emulation has higher latency (LDS +// store + ``s_waitcnt`` + LDS load -- roughly tens of cycles per call vs. 
a few for the native swap), but produces +// correct cross-half results on wave64 hardware. Used by ``amdgpu_cross_half_shuffle_i32`` below to repair the +// cross-half story for ``ds_bpermute``, which is SIMD32-scoped on RDNA. i32 amdgpu_permlane64(i32 value) { __builtin_trap(); return 0; @@ -1078,10 +1078,10 @@ i32 amdgpu_lane_id() { // Note this is correct on every AMDGPU target we run on. On CDNA (gfx9xx, gfx940/942) ``ds_bpermute`` could in // principle directly address all 64 lanes, but because we always mask the byte argument to ``(target_lane & 31) * 4`` // we never test that path -- on both ISAs the byte index is in [0, 128) and only addresses the bottom half. The -// ``permlane64`` swap then supplies the top-half data: on hardware with the native instruction (gfx940+ CDNA3 / -// gfx11+ RDNA3+) this is a single ``v_permlane64_b32``; on older wave64-capable targets (gfx9xx CDNA1/2, gfx10.x -// RDNA1/2) the JIT patches ``amdgpu_permlane64`` to an LDS roundtrip that produces the same result at higher latency -// (see the patching logic in ``llvm_context.cpp``). +// ``permlane64`` swap then supplies the top-half data: on hardware with the native instruction (gfx11 RDNA3 / gfx12 +// RDNA4) this is a single ``v_permlane64_b32``; on every other wave64-capable target -- all CDNA parts (gfx9xx, +// including gfx940/gfx942 CDNA3 and gfx950 CDNA4) and gfx10.x RDNA1/2 -- the JIT patches ``amdgpu_permlane64`` to an +// LDS roundtrip that produces the same result at higher latency (see the patching logic in ``llvm_context.cpp``). // // OOR target lanes (``target_lane < 0`` or ``target_lane >= 64``): we mask to ``target_lane & 31`` for the byte and // ``& 32`` for the half-bit. 
The behaviour for OOR targets is implementation-defined on every backend (CUDA's diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index dd5f69c89c..deb6b72797 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -3845,8 +3845,11 @@ def test_subgroup_exclusive_add_tiled_log2_size_6(): # to the ``permlane64``-based cross-half helper a lane in the bottom half could not read the top half (and vice # versa). All five tests are gated to ``log2_group_size() == 6`` so they only assert anything on real wave64 # hardware -- CUDA and SPIR-V backends with wave32 skip the absolute-correctness check (the cross-half partner is -# out of range there, which is implementation-defined). CDNA (gfx9xx, MI300X) already had a wave64-wide -# ``ds_bpermute`` so its behaviour is unchanged by the fix; the new helper is observably a no-op on that path. +# out of range there, which is implementation-defined). Both CDNA and RDNA wave64 run these: the helper masks +# ``ds_bpermute`` to the bottom 32 lanes and pulls the top half via ``amdgpu_permlane64``, which the JIT lowers to the +# native ``v_permlane64_b32`` on RDNA3+/gfx11+ but to an LDS-roundtrip emulation on every CDNA part (gfx9xx, incl. +# gfx942 MI300X) -- CDNA has no ``v_permlane64_b32`` and emitting the intrinsic there crashes the AMDGPU backend +# (genesis-world issue #2962). On CDNA these tests therefore exercise the emulation path automatically. # -------------------------------------------------------------------------------------------------------------------- From 33a7f16e76a172efad83bc9aa67de879a369541f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 17 Jun 2026 12:40:23 -0700 Subject: [PATCH 2/4] [Bug] AMDGPU: make wave64 cross-half shuffle correct on CDNA Gating permlane64 off CDNA (prev commit) stops the SIGSEGV but exposed a latent correctness bug: the cross-half shuffle helper is RDNA-shaped. 
It masks ds_bpermute's target to 31 and relies on permlane64 to fetch the top half, which is correct only where ds_bpermute is SIMD32-scoped (RDNA). On CDNA ds_bpermute is wave64-wide, so masking to 31 means every lane reads a bottom-half lane and the top half is never reached -- shuffle_xor(v,32) returned [32..63, 32..63] instead of [32..63, 0..31] on gfx942 (CI never caught this: RDNA uses the native instruction, not this path). Make the lowering architecture-aware via two JIT-patched knobs: - amdgpu_ds_bpermute_lane_mask(): 63 on GCN/CDNA (gfx9xx, wave64-wide ds_bpermute), 31 on RDNA (SIMD32-scoped). With mask 63 a single wide ds_bpermute already returns lane target_lane for the whole wave. - permlane64 patched to the identity on CDNA, so the helper's cross-SIMD branch equals the same-SIMD branch and the per-lane select is a true no-op (and the intrinsic, which has no MC opcode on CDNA, is never emitted). RDNA paths are unchanged: gfx11/gfx12 keep native v_permlane64_b32, gfx10.x keeps the LDS-roundtrip emulation, both with lane mask 31. --- quadrants/runtime/llvm/llvm_context.cpp | 84 ++++++++++++------- .../runtime/llvm/runtime_module/runtime.cpp | 34 +++++--- tests/python/test_simt.py | 11 +-- 3 files changed, 83 insertions(+), 46 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp index c50e2fbd86..372375fd16 100644 --- a/quadrants/runtime/llvm/llvm_context.cpp +++ b/quadrants/runtime/llvm/llvm_context.cpp @@ -552,41 +552,65 @@ std::unique_ptr QuadrantsLLVMContext::module_from_file(const std:: } patch_intrinsic("amdgpu_clock_i64", llvm::Intrinsic::amdgcn_s_memtime); patch_intrinsic("amdgpu_ds_bpermute", llvm::Intrinsic::amdgcn_ds_bpermute); - // ``llvm.amdgcn.permlane64`` exchanges a 32-bit value between lanes ``i`` and ``i ^ 32`` in a single instruction. 
- // We use it to extend the SIMD32-scoped ``ds_bpermute`` (every shuffle op lowers to that) into a wave64-aware - // cross-half shuffle on RDNA: ``ds_bpermute`` reads within the lane's own 32-lane SIMD cluster, ``permlane64`` - // brings the other SIMD's value to this lane, and we select between the two based on which half the target lane - // sits in. See ``amdgpu_cross_half_shuffle_i32`` in runtime.cpp. ``v_permlane64_b32`` is an RDNA-only - // instruction: it exists on gfx11 (RDNA3) and gfx12 (RDNA4), but on NO CDNA part -- the AMD assembler rejects it - // ("instruction not supported on this GPU") for gfx908/gfx90a (CDNA1/2), gfx940/gfx941/gfx942 (CDNA3) and gfx950 - // (CDNA4) alike. On every wave64-capable target without the native instruction (all CDNA parts, plus gfx10.x - // RDNA1/2) we must provide a software emulation. CDNA3 (gfx942) is especially treacherous: the backend does NOT - // cleanly "Cannot select" the intrinsic -- it selects the ``V_PERMLANE64_B32`` pseudo, which has no valid MC - // opcode for CDNA, and then crashes with a bare SIGSEGV inside ``SIInstrInfo::getInstSizeInBytes`` during the - // branch-relaxation pass. So we must never emit the intrinsic on CDNA. + // The wave64 cross-half subgroup shuffle (``amdgpu_cross_half_shuffle_i32`` in runtime.cpp) is built from + // ``ds_bpermute`` plus, on some targets, ``permlane64``. How those behave -- and therefore how we patch them -- + // depends on the architecture family: // - // The emulation is a wave-local LDS roundtrip: each lane writes its ``value`` to ``lds[wave_base + lane]``, - // a wavefront-scope acquire-release fence lowers to ``s_waitcnt lgkmcnt(0)`` (drains outstanding LDS writes), - // and each lane then reads ``lds[wave_base + (lane ^ 32)]``. 
On RDNA wave64-emulation the two SIMD32 halves of - // the wave issue store / load in two passes apiece, but the waitcnt between them guarantees both halves' stores - // are committed to LDS before either half's loads issue, so the cross-half routing is correct. ``wave_base`` - // is ``(workitem.id.x >> 6) << 6``, scoping the LDS slot to a single wave so multi-wave workgroups don't - // collide. The LDS buffer is a 1024-entry per-workgroup global (4 KiB) -- enough for the AMDGPU 1024-thread - // workgroup max at wave64. The buffer is only materialised on this code path, so kernels on permlane64-capable - // hardware (the common case) pay zero LDS for cross-half shuffles. + // * GCN / CDNA (gfx9xx: gfx900/906 Vega, gfx908/gfx90a CDNA1/2, gfx940/gfx941/gfx942 CDNA3, gfx950 CDNA4): + // ``ds_bpermute`` addresses the full wave64 directly, so the cross-half shuffle is a single wide + // ``ds_bpermute`` (lane mask 63) and ``permlane64`` is unnecessary. Critically, ``v_permlane64_b32`` does + // not exist on CDNA -- emitting ``llvm.amdgcn.permlane64`` makes the backend select the ``V_PERMLANE64_B32`` + // pseudo, which has no valid MC opcode for CDNA, and then crash with a bare SIGSEGV inside + // ``SIInstrInfo::getInstSizeInBytes`` during branch relaxation (genesis-world #2962). So on CDNA we patch + // ``amdgpu_permlane64`` to the identity, which neutralises the helper's (RDNA-shaped) cross-SIMD branch + // without ever emitting the intrinsic. + // * RDNA3/4 (gfx11 / gfx12): ``ds_bpermute`` is SIMD32-scoped (lane mask 31), so the top half is reached via + // the native single-instruction ``v_permlane64_b32``. + // * RDNA1/2 (gfx10.x): ``ds_bpermute`` is SIMD32-scoped (lane mask 31), but ``v_permlane64_b32`` does not + // exist yet, so we emulate the lane ``i`` <-> ``i ^ 32`` swap with an LDS roundtrip (below). 
// - // The intrinsic is overloaded on its element type (signature ``T -> T`` for any 32-bit-or-smaller ``T``), so we - // have to pass the explicit ``i32`` type alongside the ID -- otherwise ``CreateIntrinsic`` segfaults inside - // ``getDeclaration()`` while resolving the mangled name. + // The LDS emulation writes each lane's ``value`` to ``lds[wave_base + lane]``, issues a wavefront-scope + // acquire-release fence (lowers to ``s_waitcnt lgkmcnt(0)``: drains outstanding LDS writes without the + // cross-wave ``s_barrier`` a workgroup-scope fence would emit, which would deadlock if only some waves reach + // this point), then reads back ``lds[wave_base + (lane ^ 32)]``. ``wave_base`` is ``(workitem.id.x >> 6) << 6``, + // scoping the slot to a single wave so multi-wave workgroups don't collide. The buffer is a 1024-entry + // per-workgroup global (4 KiB, the AMDGPU 1024-thread wave64 max), materialised only on this path, so kernels + // on the other two paths pay zero LDS for cross-half shuffles. + // + // ``patch_intrinsic`` for permlane64 passes the explicit ``i32`` type alongside the ID because the intrinsic is + // overloaded on its element type (signature ``T -> T`` for any 32-bit-or-smaller ``T``); otherwise + // ``CreateIntrinsic`` segfaults inside ``getDeclaration()`` while resolving the mangled name. auto mcpu_str = AMDGPUContext::get_instance().get_mcpu(); - // RDNA3+ (gfx11) and RDNA4 (gfx12) only. No CDNA part has ``v_permlane64_b32`` (see above), so every gfx9xx - // target takes the LDS emulation -- including gfx940/gfx941/gfx942 (CDNA3), which used to be (wrongly) listed - // here and made the AMDGPU backend segfault on any kernel using a cross-half subgroup shuffle. 
- bool has_permlane64 = (mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12"); - if (has_permlane64) { + bool is_gcn_cdna = (mcpu_str.substr(0, 4) == "gfx9"); + bool has_native_permlane64 = (mcpu_str.substr(0, 5) == "gfx11" || mcpu_str.substr(0, 5) == "gfx12"); + + // Patch the ds_bpermute lane mask used by ``amdgpu_cross_half_shuffle_i32``: 63 where ``ds_bpermute`` is + // wave64-wide (GCN/CDNA), 31 where it is SIMD32-scoped (RDNA, paired with the permlane64 swap above). + if (auto mask_func = module->getFunction("amdgpu_ds_bpermute_lane_mask")) { + mask_func->deleteBody(); + auto bb = llvm::BasicBlock::Create(*ctx, "entry", mask_func); + IRBuilder<> builder(*ctx); + builder.SetInsertPoint(bb); + builder.CreateRet(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), is_gcn_cdna ? 63 : 31)); + QuadrantsLLVMContext::mark_inline(mask_func); + } + + if (has_native_permlane64) { patch_intrinsic("amdgpu_permlane64", llvm::Intrinsic::amdgcn_permlane64, true, {llvm::Type::getInt32Ty(*ctx)}); + } else if (is_gcn_cdna) { + // CDNA: the wide ds_bpermute already reaches all 64 lanes, so permlane64 is unnecessary -- and emitting it + // crashes the backend. Patch it to the identity so the helper's cross-SIMD branch returns the same value as + // its same-SIMD branch, making the per-lane select a true no-op. + if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) { + permlane64_func->deleteBody(); + auto bb = llvm::BasicBlock::Create(*ctx, "entry", permlane64_func); + IRBuilder<> builder(*ctx); + builder.SetInsertPoint(bb); + builder.CreateRet(&*permlane64_func->arg_begin()); + QuadrantsLLVMContext::mark_inline(permlane64_func); + } } else if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) { - // LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + lane``. + // gfx10.x RDNA1/2: LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + lane``. 
auto i32_ty = llvm::Type::getInt32Ty(*ctx); auto buf_ty = llvm::ArrayType::get(i32_ty, 1024); auto lds_global = llvm::cast_or_null(module->getNamedValue("__amdgpu_permlane64_lds")); diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index a4d4f4d39b..0edb72ec5d 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -1059,6 +1059,15 @@ i32 amdgpu_lane_id() { return amdgpu_mbcnt_hi(-1, amdgpu_mbcnt_lo(-1, 0)); } +// Lane-index mask applied to ``ds_bpermute``'s target in ``amdgpu_cross_half_shuffle_i32``. ``ds_bpermute``'s reach +// differs by ISA, so ``llvm_context.cpp`` patches this at JIT load time: 63 on GCN/CDNA (gfx9xx), where +// ``ds_bpermute`` addresses the full wave64 directly, and 31 on RDNA (gfx10/11/12), where it is SIMD32-scoped and the +// top half is reached via the ``permlane64`` swap instead. Defaults to 31 so an unpatched build stays correct on +// RDNA. (runtime.cpp is compiled at -O0 to bitcode, so this stays a real call until the JIT patches + inlines it.) +i32 amdgpu_ds_bpermute_lane_mask() { + return 31; +} + // Wave64-aware "read ``value`` from lane ``target_lane``" gather for AMDGPU. Shared by every i32 shuffle variant // (``shuffle`` / ``shuffle_down`` / ``shuffle_up``); the f32 / i64 / f64 wrappers below decompose into i32 calls and // therefore inherit the wave64 fix for free. @@ -1075,13 +1084,16 @@ i32 amdgpu_lane_id() { // so ``ds_bpermute(byte, permlane64(value))`` effectively reads from lanes 32-63. We always compute both reads and // select between them branchlessly based on the high bit of ``target_lane``: bit 5 picks the half. // -// Note this is correct on every AMDGPU target we run on. 
On CDNA (gfx9xx, gfx940/942) ``ds_bpermute`` could in -// principle directly address all 64 lanes, but because we always mask the byte argument to ``(target_lane & 31) * 4`` -// we never test that path -- on both ISAs the byte index is in [0, 128) and only addresses the bottom half. The -// ``permlane64`` swap then supplies the top-half data: on hardware with the native instruction (gfx11 RDNA3 / gfx12 -// RDNA4) this is a single ``v_permlane64_b32``; on every other wave64-capable target -- all CDNA parts (gfx9xx, -// including gfx940/gfx942 CDNA3 and gfx950 CDNA4) and gfx10.x RDNA1/2 -- the JIT patches ``amdgpu_permlane64`` to an -// LDS roundtrip that produces the same result at higher latency (see the patching logic in ``llvm_context.cpp``). +// The lane-index mask (``amdgpu_ds_bpermute_lane_mask()``) and ``permlane64`` are both JIT-patched per architecture +// (see ``llvm_context.cpp``), because ``ds_bpermute``'s reach differs by ISA: +// * GCN / CDNA (gfx9xx, incl. gfx940/gfx942/gfx950): ``ds_bpermute`` addresses the full wave64, so the mask is 63 +// and a single ``ds_bpermute((target_lane & 63) * 4, value)`` already returns lane ``target_lane`` for the whole +// wave. ``permlane64`` is patched to the identity there -- ``v_permlane64_b32`` does not exist on CDNA and +// emitting it crashes the backend (genesis-world #2962) -- so ``from_other_half`` equals ``from_self_half`` and +// the select below is a true no-op. +// * RDNA (gfx10/11/12): ``ds_bpermute`` is SIMD32-scoped, so the mask is 31 and ``ds_bpermute`` only reaches the +// issuing lane's own 32-lane half. The top half is supplied by the ``permlane64`` swap -- a single +// ``v_permlane64_b32`` on gfx11/gfx12, or an LDS-roundtrip emulation on gfx10.x. // // OOR target lanes (``target_lane < 0`` or ``target_lane >= 64``): we mask to ``target_lane & 31`` for the byte and // ``& 32`` for the half-bit. 
The behaviour for OOR targets is implementation-defined on every backend (CUDA's @@ -1093,12 +1105,12 @@ i32 amdgpu_cross_half_shuffle_i32(i32 target_lane, i32 value) { // wave -- lifting it above the select keeps the AMDGPU backend happy and lets it issue exactly one // ``v_permlane64_b32``. ``ds_bpermute`` on RDNA wave64 is SIMD32-scoped with a 5-bit address (top half of the wave // is unreachable directly), so ``from_self_half`` handles the same-SIMD case and ``from_other_half`` handles the - // cross-SIMD case via the ``swapped`` payload. On CDNA the wave is one SIMD64 so both reads return the same value - // and the select is a no-op; we don't try to optimize that out because the dead read is cheap (LLVM CSE may fold - // it anyway). + // cross-SIMD case via the ``swapped`` payload. On CDNA the lane mask is 63 (``ds_bpermute`` is wave64-wide) and + // ``permlane64`` is patched to the identity, so ``swapped == value``, both reads return lane ``target_lane``, and + // the select is a true no-op; we don't optimize that out because the dead read is cheap (LLVM CSE may fold it). i32 self_lane = amdgpu_lane_id(); i32 swapped = amdgpu_permlane64(value); - i32 byte = (target_lane & 31) * 4; + i32 byte = (target_lane & amdgpu_ds_bpermute_lane_mask()) * 4; // ``llvm.amdgcn.ds.bpermute`` is the real hardware ``ds_bpermute_b32`` -- but if LLVM's uniformity analysis decides // ``byte`` is uniform across the wave (e.g. ``target_lane`` is a compile-time constant), it sometimes lowers to a // ``v_readlane_b32``-style instruction that addresses lanes 0..31 wave-globally rather than SIMD32-locally. On diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index deb6b72797..b5f055010b 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -3845,11 +3845,12 @@ def test_subgroup_exclusive_add_tiled_log2_size_6(): # to the ``permlane64``-based cross-half helper a lane in the bottom half could not read the top half (and vice # versa). 
All five tests are gated to ``log2_group_size() == 6`` so they only assert anything on real wave64 # hardware -- CUDA and SPIR-V backends with wave32 skip the absolute-correctness check (the cross-half partner is -# out of range there, which is implementation-defined). Both CDNA and RDNA wave64 run these: the helper masks -# ``ds_bpermute`` to the bottom 32 lanes and pulls the top half via ``amdgpu_permlane64``, which the JIT lowers to the -# native ``v_permlane64_b32`` on RDNA3+/gfx11+ but to an LDS-roundtrip emulation on every CDNA part (gfx9xx, incl. -# gfx942 MI300X) -- CDNA has no ``v_permlane64_b32`` and emitting the intrinsic there crashes the AMDGPU backend -# (genesis-world issue #2962). On CDNA these tests therefore exercise the emulation path automatically. +# out of range there, which is implementation-defined). Both CDNA and RDNA wave64 run these, via different lowerings +# of ``amdgpu_cross_half_shuffle_i32``: on CDNA (gfx9xx, incl. gfx942 MI300X) ``ds_bpermute`` is wave64-wide, so the +# cross-half read is a single wide ``ds_bpermute`` (lane mask 63) and ``permlane64`` is unused; on RDNA ``ds_bpermute`` +# is SIMD32-scoped (lane mask 31) and the top half comes from ``amdgpu_permlane64`` -- the native ``v_permlane64_b32`` +# on gfx11+/gfx12, or an LDS-roundtrip emulation on gfx10.x. ``v_permlane64_b32`` does not exist on CDNA and emitting +# it there crashes the AMDGPU backend (genesis-world issue #2962), so the JIT must never emit it on gfx9xx. 
# -------------------------------------------------------------------------------------------------------------------- From 7ba5a45e928ccc89b7538586984b5da78a7b34a7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 17 Jun 2026 13:23:05 -0700 Subject: [PATCH 3/4] [Docs] AMDGPU: correct wave64 cross-half shuffle lowering for CDNA The subgroup docs described emitting v_permlane64_b32 on CDNA as "well-defined and free", which is exactly the pre-fix behavior that crashed the AMDGPU backend (genesis-world #2962). Document the actual per-arch lowering: a single wave-wide ds_bpermute on CDNA (no permlane64) and the permlane64 + ds_bpermute + select pairing on RDNA wave64. --- docs/source/user_guide/subgroup.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/user_guide/subgroup.md b/docs/source/user_guide/subgroup.md index ba99df5a92..ce21098538 100644 --- a/docs/source/user_guide/subgroup.md +++ b/docs/source/user_guide/subgroup.md @@ -110,14 +110,14 @@ Each lane returns the `value` held by the lane whose subgroup-local id equals `i Lane `i` returns the `value` held by lane `i + offset`. Lanes near the top of the subgroup - where `i + offset >= subgroup_size` - receive an implementation-defined value (typically their own `value`), so reduction patterns must only trust lane 0's final result, or mask out the out-of-range lanes. - `value` and `offset` dtypes: same as `shuffle` above; `offset` is a `u32`. -- Maps to `__shfl_down_sync` on CUDA and `OpGroupNonUniformShuffleDown` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute`; wave64 cross-half offsets (any `offset >= 32` for low-half lanes, or any non-zero `offset` for high-half lanes that lands across the SIMD32 boundary) go through the same `permlane64 + ds_bpermute + select` lowering as `shuffle` - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). These operations are added on both RDNA and CDNA. 
+- Maps to `__shfl_down_sync` on CUDA and `OpGroupNonUniformShuffleDown` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute`; wave64 cross-half offsets (any `offset >= 32` for low-half lanes, or any non-zero `offset` for high-half lanes that lands across the SIMD32 boundary) go through the same wave64 cross-half lowering as `shuffle` - a single wave-wide `ds_bpermute` on CDNA, or a `permlane64 + ds_bpermute + select` sequence on RDNA - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). These operations are added on both RDNA and CDNA. ### `shuffle_up(value, offset)` Lane `i` returns the `value` held by lane `i - offset`. Lanes near the bottom of the subgroup - where `i - offset < 0` - receive an implementation-defined value (typically their own `value`), so the bottom `offset` lanes' results should be ignored or masked. - Same dtype rules as `shuffle` / `shuffle_down`; `offset` is a `u32`. -- Maps to `__shfl_up_sync` on CUDA and `OpGroupNonUniformShuffleUp` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute((lane - offset) * 4, value)`; wave64 cross-half cases go through the [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) (same `permlane64 + ds_bpermute + select` sequence as `shuffle` / `shuffle_down`). These operations are added on both RDNA and CDNA. +- Maps to `__shfl_up_sync` on CUDA and `OpGroupNonUniformShuffleUp` on SPIR-V. On AMDGPU it is emulated with `ds_bpermute((lane - offset) * 4, value)`; wave64 cross-half cases go through the [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) (same per-arch handling as `shuffle` / `shuffle_down`: a single wave-wide `ds_bpermute` on CDNA, a `permlane64 + ds_bpermute + select` sequence on RDNA). These operations are added on both RDNA and CDNA. ### `shuffle_xor(value, mask)` @@ -132,7 +132,7 @@ Lane `i` returns the `value` held by lane `i ^ mask`. 
Convenient for butterfly p Every lane in the subgroup returns the `value` held by the lane whose subgroup-local id equals `index`. Expresses intent ("read lane `index`") more directly than `shuffle(value, index)` and on backends with a dedicated broadcast may map to a cheaper instruction. - Same dtype rules as `shuffle`. -- Maps to `__shfl_sync` on CUDA, `ds_bpermute` (plus a `permlane64`-driven cross-half select on wave64) on AMDGPU, and `OpGroupNonUniformBroadcast` on SPIR-V. See [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) for the wave64 mechanics. These operations are added on both RDNA and CDNA. +- Maps to `__shfl_sync` on CUDA, `ds_bpermute` (with a `permlane64`-driven cross-half select on RDNA wave64, or a single wave-wide `ds_bpermute` on CDNA wave64) on AMDGPU, and `OpGroupNonUniformBroadcast` on SPIR-V. See [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering) for the wave64 mechanics. These operations are added on both RDNA and CDNA. - **Important: on SPIR-V, `index` must be dynamically uniform** - the same value on every lane in the subgroup. Passing a per-lane varying `index` is undefined behavior, because `OpGroupNonUniformBroadcast` requires its `Id` operand to be dynamically uniform across the subgroup. On CUDA / AMDGPU, `index` may vary per lane and the call is identical to `shuffle(value, index)`. If you need a varying source lane, use `shuffle` directly. ### `broadcast_first(value)` @@ -565,9 +565,10 @@ After the call, lane `k` (within each group of 32) holds `a[group_start] + a[gro ### AMDGPU wave64 cross-half lowering -AMDGPU `ds_bpermute_b32` - the LDS-routed permute that Quadrants uses to lower `shuffle`, `shuffle_down`, and `shuffle_up` - has a hardware quirk on RDNA (gfx10/11/12, e.g. RX 7900 XTX): its lane-id operand is **SIMD32-scoped**. 
On a wave64 RDNA wave the 64 lanes execute as two SIMD32 clusters; `ds_bpermute` on those chips can only address lanes inside the requesting lane's own SIMD32 half. CDNA (gfx9xx, MI200/MI300) keeps the wave on a single SIMD64, so `ds_bpermute` there is wave-wide and the quirk does not exist. +AMDGPU `ds_bpermute_b32` - the LDS-routed permute that Quadrants uses to lower `shuffle`, `shuffle_down`, and `shuffle_up` - reaches a different set of lanes depending on the architecture, so Quadrants lowers cross-half wave64 shuffles two different ways: -To make wave64 `shuffle` / `shuffle_down` / `shuffle_up` behave consistently across RDNA and CDNA, Quadrants always lowers cross-half-capable shuffles through this 3-op sequence: +- **CDNA (gfx9xx, MI200 / MI300)**: the wave runs as a single SIMD64, so `ds_bpermute_b32` is wave-wide and addresses all 64 lanes directly. A cross-half shuffle is therefore a single `ds_bpermute_b32 (target_lane * 4), value` with the lane id masked to 6 bits - no `permlane64` involved. Quadrants must **not** emit `v_permlane64_b32` here: the instruction does not exist on any CDNA part, and feeding `llvm.amdgcn.permlane64` to the backend on gfx9xx makes it select a pseudo with no valid CDNA machine opcode and then crash during branch relaxation (genesis-world issue #2962). +- **RDNA (gfx10/11/12, e.g. RX 7900 XTX)**: the 64 lanes execute as two SIMD32 clusters and `ds_bpermute_b32`'s lane-id operand is **SIMD32-scoped** - it can only address lanes inside the requesting lane's own 32-lane half. To reach the other half, Quadrants pairs `ds_bpermute` with a half-swap through this sequence (one half-swap, two `ds_bpermute_b32` reads, and a per-lane select): ``` swapped = v_permlane64_b32 value # swap the two SIMD32 halves of the wave @@ -576,14 +577,14 @@ hi = ds_bpermute_b32 (lane*4), swapped result = ((target_lane ^ self_lane) & 32) ? 
hi : lo ``` -The two `ds_bpermute_b32` reads run in parallel - one reads the original payload (correct when target is in the same SIMD32 half), the other reads the `permlane64`-swapped payload (correct when the target is in the other half) - and a per-lane select picks between them based on whether the target crosses the 32-lane boundary. On CDNA the cross-half branch is dead, but the cost is one extra `v_permlane64_b32` (still well-defined and free) and one `v_cndmask_b32` - no measurable hit. On RDNA wave64 this is the only correct lowering. +The two `ds_bpermute_b32` reads run in parallel - one reads the original payload (correct when the target is in the same SIMD32 half), the other reads the `permlane64`-swapped payload (correct when the target is in the other half) - and a per-lane select picks between them based on whether the target crosses the 32-lane boundary. The half-swap is a single `v_permlane64_b32` on gfx11 / gfx12 (RDNA3 / RDNA4); on gfx10.x (RDNA1/2), which predates the instruction, it is emulated with a wave-local LDS round-trip. (The same sequence is emitted on CDNA too, but there `permlane64` is reduced to a no-op, so both reads fetch the same payload: they collapse into one wave-wide `ds_bpermute` and the select becomes dead.) -One subtlety worth knowing about (mostly for anyone reading the generated IR): the lane-id operand to `ds_bpermute` is wrapped in an empty `+v` inline-asm fence inside the runtime helper. Without that fence, LLVM's AMDGPU backend can decide a compile-time-constant or otherwise uniform lane-id is "uniform across the wave" and silently lower the call to a `v_readlane_b32`-style instruction that addresses lanes 0..31 **wave-globally** rather than SIMD32-locally. That would break cross-half shuffles whose target lane is a literal (`broadcast(v, 47)`, `shuffle(v, qd.u32(40))`, etc.). 
The fence costs zero - same instruction shape on every path - and pins the lowering to a real `ds_bpermute_b32` so the SIMD-local semantics our `permlane64` pairing relies on always hold. +One subtlety worth knowing about (mostly for anyone reading the generated IR): the lane-id operand to `ds_bpermute` is wrapped in an empty `+v` inline-asm fence inside the runtime helper. Without that fence, LLVM's AMDGPU backend can decide a compile-time-constant or otherwise uniform lane-id is "uniform across the wave" and silently lower the call to a `v_readlane_b32`-style instruction that addresses lanes 0..31 **wave-globally** instead of following the real `ds_bpermute_b32` lane-addressing semantics. That would break cross-half shuffles whose target lane is a literal (`broadcast(v, 47)`, `shuffle(v, qd.u32(40))`, etc.) on both RDNA and CDNA. The fence costs nothing - same instruction shape on every path - and pins the lowering to a real `ds_bpermute_b32`, so that the lane-addressing semantics the shuffle lowering relies on always hold. ## Performance notes - Shuffles are register-to-register on CUDA (`__shfl_sync`, `__shfl_down_sync`, `__shfl_up_sync`) and on SPIR-V where the GPU has hardware support - typically a handful of cycles, no memory traffic. -- AMDGPU `shuffle`, `shuffle_down`, and `shuffle_up` all go through `ds_permute` / `ds_bpermute` (LDS-routed, roughly tens of cycles). On wave64 the lowering issues two parallel `ds_bpermute_b32` reads plus a `v_permlane64_b32` swap and a per-lane select to handle cross-half shuffles correctly on RDNA - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). The two `ds_bpermute` reads issue in parallel, so the latency is the same as a single read; the `permlane64` and `cndmask` add a few extra cycles. +- AMDGPU `shuffle`, `shuffle_down`, and `shuffle_up` all go through `ds_permute` / `ds_bpermute` (LDS-routed, roughly tens of cycles). 
On CDNA wave64 a cross-half shuffle is a single wave-wide `ds_bpermute_b32`; on RDNA wave64 the lowering issues two parallel `ds_bpermute_b32` reads plus a `v_permlane64_b32` swap and a per-lane select to reach across the SIMD32 boundary - see [AMDGPU wave64 cross-half lowering](#amdgpu-wave64-cross-half-lowering). The two `ds_bpermute` reads issue in parallel, so the latency is the same as a single read; on RDNA the `permlane64` and `cndmask` add a few extra cycles. - `shuffle_xor` and `broadcast_first` are `@qd.func` wrappers over `shuffle` / `broadcast` and inline at compile time, so on every backend they cost exactly the same as the underlying op. - Both `ballot_first_n` and `ballot` lower to a single hardware instruction on every backend - one cycle on CUDA (`__ballot_sync`), one instruction on AMDGPU (a single `v_cmp_*_e64` populating the wavefront-width SETCC, then a low-half store for `ballot_first_n`), and `OpGroupNonUniformBallot` on SPIR-V (extract one or two components of the result `uvec4`). At `n == 32` `ballot_first_n` elides the predicate-masking step entirely; at `n < 32` it inserts one extra multiply on the predicate. - `reduce_add` and `reduce_all_add` both issue exactly `log2_group_size()` shuffles and `log2_group_size()` adds per call (5 on wave32, 6 on AMDGPU wave64). No barriers, no shared memory, no launch overhead (they inline). The same holds for the `_tiled` form at any `log2_size`. 
From cda264cb2e23f59a45410bae382909cd52f09da8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 17 Jun 2026 14:08:13 -0700 Subject: [PATCH 4/4] [Misc] AMDGPU: clang-format reflow of permlane64 emulation comment --- quadrants/runtime/llvm/llvm_context.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp index 372375fd16..31d1a151fd 100644 --- a/quadrants/runtime/llvm/llvm_context.cpp +++ b/quadrants/runtime/llvm/llvm_context.cpp @@ -610,7 +610,8 @@ std::unique_ptr QuadrantsLLVMContext::module_from_file(const std:: QuadrantsLLVMContext::mark_inline(permlane64_func); } } else if (auto permlane64_func = module->getFunction("amdgpu_permlane64")) { - // gfx10.x RDNA1/2: LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + lane``. + // gfx10.x RDNA1/2: LDS-based software emulation. Layout: ``[1024 x i32] addrspace(3)`` indexed by ``wave_base + + // lane``. auto i32_ty = llvm::Type::getInt32Ty(*ctx); auto buf_ty = llvm::ArrayType::get(i32_ty, 1024); auto lds_global = llvm::cast_or_null(module->getNamedValue("__amdgpu_permlane64_lds"));