diff --git a/clang/include/clang/Basic/BuiltinsAIE2P.def b/clang/include/clang/Basic/BuiltinsAIE2P.def index c9c36f7468fb..f9e35f2a97aa 100644 --- a/clang/include/clang/Basic/BuiltinsAIE2P.def +++ b/clang/include/clang/Basic/BuiltinsAIE2P.def @@ -342,6 +342,7 @@ BUILTIN(__builtin_aie2p_fifo_ld_fill, "vv*&V32i&i&", "t") BUILTIN(__builtin_aie2p_fifo_ld_pop_512_unaligned, "V64cv*&V32i&i&", "t") BUILTIN(__builtin_aie2p_fifo_ld_pop_544_bfp16, "vv*&V32i&i&V64c&V8c&", "t") BUILTIN(__builtin_aie2p_fifo_ld_pop_576_bfp16, "vv*&V32i&i&V64c&V8c&", "t") +BUILTIN(__builtin_aie2p_fifo_ld_pop_640_unaligned_sparse, "vv*&V32i&i&V64c&V16c&", "t") BUILTIN(__builtin_aie2p_fifo_ld_pop_1d_512_unaligned, "V64cv*&V32i&i&i", "t") BUILTIN(__builtin_aie2p_fifo_ld_pop_1d_544_bfp16, "vv*&V32i&i&iV64c&V8c&", "t") BUILTIN(__builtin_aie2p_fifo_ld_pop_1d_576_bfp16, "vv*&V32i&i&iV64c&V8c&", "t") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 19fe6d0e8eb0..aa8fec771bc6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -23893,6 +23893,8 @@ static llvm::Intrinsic::ID getAIE2PIntrinsicFunction(unsigned BuiltinID) { return Intrinsic::aie2p_fifo_ld_pop_576_bfp16; case AIE::BI__builtin_aie2p_fifo_ld_pop_544_bfp16: return Intrinsic::aie2p_fifo_ld_pop_544_bfp16; + case AIE::BI__builtin_aie2p_fifo_ld_pop_640_unaligned_sparse: + return Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse; case AIE::BI__builtin_aie2p_fifo_ld_pop_1d_512_unaligned: return Intrinsic::aie2p_fifo_ld_pop_1d_unaligned; case AIE::BI__builtin_aie2p_fifo_ld_pop_1d_576_bfp16: @@ -24680,6 +24682,7 @@ Value *CodeGenFunction::EmitAIEBuiltinExpr(unsigned BuiltinID, case AIE::BI__builtin_aie2p_fifo_ld_pop_3d_544_bfp16: case AIE::BI__builtin_aie2p_fifo_ld_pop_544_bfp16: case AIE::BI__builtin_aie2p_fifo_ld_pop_576_bfp16: + case AIE::BI__builtin_aie2p_fifo_ld_pop_640_unaligned_sparse: case AIE::BI__builtin_aie2ps_fifo_ld_pop_BFP640: case 
AIE::BI__builtin_aie2ps_fifo_ld_pop_BFP768: case AIE::BI__builtin_aie2ps_fifo_ld_pop_1d_BFP640: @@ -24728,6 +24731,7 @@ Value *CodeGenFunction::EmitAIEBuiltinExpr(unsigned BuiltinID, case AIE::BI__builtin_aie2p_fifo_ld_pop_3d_544_bfp16: case AIE::BI__builtin_aie2p_fifo_ld_pop_544_bfp16: case AIE::BI__builtin_aie2p_fifo_ld_pop_576_bfp16: + case AIE::BI__builtin_aie2p_fifo_ld_pop_640_unaligned_sparse: MXStructCount = 2; break; case AIE::BI__builtin_aie2ps_fifo_ld_pop_BFP640: @@ -25066,7 +25070,8 @@ Value *CodeGenFunction::EmitAIE2PBuiltinExpr(unsigned BuiltinID, case AIE::BI__builtin_aie2p_fifo_ld_pop_2d_544_bfp16: case AIE::BI__builtin_aie2p_fifo_ld_pop_3d_512_unaligned: case AIE::BI__builtin_aie2p_fifo_ld_pop_3d_576_bfp16: - case AIE::BI__builtin_aie2p_fifo_ld_pop_3d_544_bfp16: { + case AIE::BI__builtin_aie2p_fifo_ld_pop_3d_544_bfp16: + case AIE::BI__builtin_aie2p_fifo_ld_pop_640_unaligned_sparse: { return this->EmitAIEBuiltinExpr(BuiltinID, E, Arch); } default: diff --git a/clang/lib/Headers/aie2p/aie2p_aie_api_compat.h b/clang/lib/Headers/aie2p/aie2p_aie_api_compat.h index 3c45fef8b9f5..170d7c39ba8f 100644 --- a/clang/lib/Headers/aie2p/aie2p_aie_api_compat.h +++ b/clang/lib/Headers/aie2p/aie2p_aie_api_compat.h @@ -50,13 +50,11 @@ struct v8cfloat { }; struct v16cfloat {}; -struct v128uint16_sparse {}; -struct v256uint8_sparse {}; -struct v512uint4_sparse {}; - -struct v128int16_sparse {}; -struct v256int8_sparse {}; -struct v512int4_sparse {}; +// AIE2P-larger (1280-bit) sparse vector types are now defined in +// aiebase_typedefs.h (Followup H — G-T3.6-003 Tier 2) so that they are +// visible to aie2p_upd_ext.h, which is included before this compat header. +// The previous empty-stub definitions here (`struct v256int8_sparse {};`) +// shadowed those real types and were removed. 
struct v256bfp16ebs16_sparse {}; struct v256bfp16ebs8_sparse {}; diff --git a/clang/lib/Headers/aie2p/aie2p_ldst.h b/clang/lib/Headers/aie2p/aie2p_ldst.h index 3a1b0e7a017c..9d5ebfc4d69a 100644 --- a/clang/lib/Headers/aie2p/aie2p_ldst.h +++ b/clang/lib/Headers/aie2p/aie2p_ldst.h @@ -488,6 +488,71 @@ FIFO_ST(__aie_dm_resource_cd, restrict) return fifo_ld_popx(p, s, 31, 31); \ } +// Sparse FIFO load for the 640-bit-class types (data=v64* + sparsity_t mask). +// Wires through the new __builtin_aie2p_fifo_ld_pop_640_unaligned_sparse +// builtin, which lowers to silicon ops VLDA_POP_640_normal_pop / +// VLDB_POP_640_normal_pop (selected via VLD_POP_640_normal_pop_pseudo). +// +// T must be a sparse vector struct (v128int8_sparse, v128uint8_sparse, +// v64int16_sparse, v64uint16_sparse, v256int4_sparse, v256uint4_sparse) +// with the {DATA data; sparsity_t mask;} layout from aiebase_typedefs.h. +// T_DATA (the 512-bit data field type) is currently unused by the macro. +#define FIFO_LD_SPARSE(T, T_DATA, DM_BANK, RESTRICT) \ + INTRINSIC(void) \ + fifo_ld_reset(T##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ + s.pos = 0; \ + sparse_fifo_t &fifo = s.fifo; \ + __builtin_aie2p_fifo_ld_fill((void DM_BANK *RESTRICT &)p, fifo, s.pos); \ + } \ + \ + INTRINSIC(void) \ + fifo_ld_fill(T##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ + int &pos = s.pos; \ + sparse_fifo_t &fifo = s.fifo; \ + __builtin_aie2p_fifo_ld_fill((void DM_BANK *RESTRICT &)p, fifo, pos); \ + } \ + \ + INTRINSIC(T) \ + fifo_ld_pop(T##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ + int &pos = s.pos; \ + sparse_fifo_t &fifo = s.fifo; \ + T r; \ + __builtin_aie2p_fifo_ld_pop_640_unaligned_sparse( \ + (void DM_BANK *RESTRICT &)p, fifo, pos, (v64char &)r.data, \ + (v16char &)r.mask); \ + return r; \ + } + +// Wide-pair sparse FIFO load: pop a smaller sparse vector twice and stitch +// into the wider sparse vector via Followup H's set_##T1 + insert helpers. +// T1 is the wide sparse type (e.g. 
v256int8_sparse, must have lo/hi fields +// of T2 type per aiebase_typedefs.h composite-struct definitions). +// T2 is the narrow sparse type (e.g. v128int8_sparse). +#define FIFO_LD_SPARSE_WIDE(T1, T2, DM_BANK, RESTRICT) \ + INTRINSIC(void) \ + fifo_ld_reset(T1##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ + T2##_unaligned DM_BANK *RESTRICT &q = \ + (T2##_unaligned DM_BANK * RESTRICT &)p; \ + fifo_ld_reset(q, s); \ + } \ + \ + INTRINSIC(void) \ + fifo_ld_fill(T1##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ + T2##_unaligned DM_BANK *RESTRICT &q = \ + (T2##_unaligned DM_BANK * RESTRICT &)p; \ + fifo_ld_fill(q, s); \ + } \ + \ + INTRINSIC(T1) \ + fifo_ld_pop(T1##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ + T2##_unaligned DM_BANK *RESTRICT &q = \ + (T2##_unaligned DM_BANK * RESTRICT &)p; \ + T1 v; \ + v = set_##T1(0, fifo_ld_pop(q, s)); \ + v = insert(v, 1, fifo_ld_pop(q, s)); \ + return v; \ + } + #define FIFO_LD_BFP16_WIDE(T1, T2, DM_BANK, RESTRICT) \ INTRINSIC(void) \ fifo_ld_reset(T1##_unaligned DM_BANK *RESTRICT &p, fifo_state_t &s) { \ @@ -571,7 +636,9 @@ FIFO_ST(__aie_dm_resource_cd, restrict) FIFO_FILLX(v64bfp16ebs8_unaligned, DM_BANK, RESTRICT) \ FIFO_FILLX(v64bfp16ebs16_unaligned, DM_BANK, RESTRICT) \ FIFO_LD_BFP16_WIDE(v128bfp16ebs8, v64bfp16ebs8, DM_BANK, RESTRICT) \ - FIFO_LD_BFP16_WIDE(v128bfp16ebs16, v64bfp16ebs16, DM_BANK, RESTRICT) + FIFO_LD_BFP16_WIDE(v128bfp16ebs16, v64bfp16ebs16, DM_BANK, RESTRICT) \ + FIFO_LD_SPARSE(v128int8_sparse, v64int8, DM_BANK, RESTRICT) \ + FIFO_LD_SPARSE_WIDE(v256int8_sparse, v128int8_sparse, DM_BANK, RESTRICT) FIFO_LD(, ) FIFO_LD(__aie_dm_resource_a, ) @@ -602,6 +669,8 @@ FIFO_LD(__aie_dm_resource_cd, restrict) #undef FIFO_POPX #undef FIFO_FILLX #undef FIFO_LD_BFP16_WIDE +#undef FIFO_LD_SPARSE +#undef FIFO_LD_SPARSE_WIDE #undef FIFO_LD #endif // AIE2P_LDST_H diff --git a/clang/lib/Headers/aie2p/aie2p_upd_ext.h b/clang/lib/Headers/aie2p/aie2p_upd_ext.h index 569574e041d4..27197b79267f 100644 
--- a/clang/lib/Headers/aie2p/aie2p_upd_ext.h +++ b/clang/lib/Headers/aie2p/aie2p_upd_ext.h @@ -2083,4 +2083,265 @@ INTRINSIC(v64bfp16ebs8) extract_v64bfp16ebs8(v128bfp16ebs8 m, int idx) { return {m.mantissaX1, m.exponentE1}; } +// --------------------------------------------------------------------------- +// Sparse vector data/sparsity extraction (Followup H — G-T3.6-003 Tier 1) +// +// Pattern mirrors the dense BFP `extract_v64bfp16ebs16` above: both operate on +// a packed struct, returning either a sub-vector or composing a smaller +// sparse from larger sparse halves. No custom builtin is needed — the +// underlying sparse register classes (mQXsa/mQXsb in +// AIE2PRegisterInfo.td:727+) are already targeted by the dense load builtin +// chain when the data field is an aligned `v64int8`. +// +// AIEv2 implements the same surface via __builtin_aiev2_ext_qx (see +// aiev2/aiev2_upd_ext.h:2602-2622), but that builtin is not defined in +// upstream Peano — it is supplied by Vitis Chess. By implementing as +// struct-field access, we avoid that builtin dependency and keep the AIE2P +// path self-contained. +// --------------------------------------------------------------------------- + +// extract_sparse_data — pull the dense data field out of a sparse vector. +// Mirrors the AIEv2 `extract_sparse_data` family (aiev2_upd_ext.h:2602-2622). +INTRINSIC(v128uint4) extract_sparse_data(v256uint4_sparse v) { + return v.data; +} +INTRINSIC(v64uint8) extract_sparse_data(v128uint8_sparse v) { + return v.data; +} +INTRINSIC(v32uint16) extract_sparse_data(v64uint16_sparse v) { + return v.data; +} +INTRINSIC(v128int4) extract_sparse_data(v256int4_sparse v) { + return v.data; +} +INTRINSIC(v64int8) extract_sparse_data(v128int8_sparse v) { + return v.data; +} +INTRINSIC(v32int16) extract_sparse_data(v64int16_sparse v) { + return v.data; +} + +// extract_v* synonym for extract_sparse_data — the aie_api compat layer +// uses both names depending on the call site. 
+INTRINSIC(v128uint4) extract_v128uint4(v256uint4_sparse v) { + return v.data; +} +INTRINSIC(v64uint8) extract_v64uint8(v128uint8_sparse v) { + return v.data; +} +INTRINSIC(v32uint16) extract_v32uint16(v64uint16_sparse v) { + return v.data; +} +INTRINSIC(v128int4) extract_v128int4(v256int4_sparse v) { + return v.data; +} +INTRINSIC(v64int8) extract_v64int8(v128int8_sparse v) { + return v.data; +} +INTRINSIC(v32int16) extract_v32int16(v64int16_sparse v) { + return v.data; +} + +// extract_sparsity — pull the sparsity_t mask out of a sparse vector. +INTRINSIC(sparsity_t) extract_sparsity(v256uint4_sparse v) { + return v.mask; +} +INTRINSIC(sparsity_t) extract_sparsity(v128uint8_sparse v) { + return v.mask; +} +INTRINSIC(sparsity_t) extract_sparsity(v64uint16_sparse v) { + return v.mask; +} +INTRINSIC(sparsity_t) extract_sparsity(v256int4_sparse v) { + return v.mask; +} +INTRINSIC(sparsity_t) extract_sparsity(v128int8_sparse v) { + return v.mask; +} +INTRINSIC(sparsity_t) extract_sparsity(v64int16_sparse v) { + return v.mask; +} + +// --------------------------------------------------------------------------- +// AIE2P-larger sparse <-> smaller sparse conversion (Followup H Tier 2) +// +// Mirrors the bfp16 pattern at extract_v64bfp16ebs16 (line ~2073 above): +// the larger composite struct holds two halves of the smaller type, and +// extract/concat is a struct-field operation. v256int8_sparse, +// v512int4_sparse, v128int16_sparse, v256uint8_sparse, v512uint4_sparse, +// v128uint16_sparse are now real composite structs (see +// aiebase_typedefs.h for the struct definitions); the empty-struct stubs +// formerly in aie2p_aie_api_compat.h made these implementations impossible. +// --------------------------------------------------------------------------- + +// extract_v*_sparse: larger -> smaller via lo/hi field access. 
+INTRINSIC(v256uint4_sparse) +extract_v256uint4_sparse(v512uint4_sparse v, int idx) { + if (idx == 0) + return v.lo; + return v.hi; +} +INTRINSIC(v128uint8_sparse) +extract_v128uint8_sparse(v256uint8_sparse v, int idx) { + if (idx == 0) + return v.lo; + return v.hi; +} +INTRINSIC(v64uint16_sparse) +extract_v64uint16_sparse(v128uint16_sparse v, int idx) { + if (idx == 0) + return v.lo; + return v.hi; +} +INTRINSIC(v256int4_sparse) +extract_v256int4_sparse(v512int4_sparse v, int idx) { + if (idx == 0) + return v.lo; + return v.hi; +} +INTRINSIC(v128int8_sparse) +extract_v128int8_sparse(v256int8_sparse v, int idx) { + if (idx == 0) + return v.lo; + return v.hi; +} +INTRINSIC(v64int16_sparse) +extract_v64int16_sparse(v128int16_sparse v, int idx) { + if (idx == 0) + return v.lo; + return v.hi; +} + +// concat: smaller pair -> larger (a becomes lo, b becomes hi). +INTRINSIC(v512uint4_sparse) +concat(v256uint4_sparse a, v256uint4_sparse b) { + return v512uint4_sparse{a, b}; +} +INTRINSIC(v256uint8_sparse) +concat(v128uint8_sparse a, v128uint8_sparse b) { + return v256uint8_sparse{a, b}; +} +INTRINSIC(v128uint16_sparse) +concat(v64uint16_sparse a, v64uint16_sparse b) { + return v128uint16_sparse{a, b}; +} +INTRINSIC(v512int4_sparse) +concat(v256int4_sparse a, v256int4_sparse b) { + return v512int4_sparse{a, b}; +} +INTRINSIC(v256int8_sparse) +concat(v128int8_sparse a, v128int8_sparse b) { + return v256int8_sparse{a, b}; +} +INTRINSIC(v128int16_sparse) +concat(v64int16_sparse a, v64int16_sparse b) { + return v128int16_sparse{a, b}; +} + +// set_v*_sparse — put v in half idx (0 = lo, else hi); rest is {}-initialized. 
+INTRINSIC(v512uint4_sparse) +set_v512uint4_sparse(int idx, v256uint4_sparse v) { + v512uint4_sparse r{}; + if (idx == 0) + r.lo = v; + else + r.hi = v; + return r; +} +INTRINSIC(v256uint8_sparse) +set_v256uint8_sparse(int idx, v128uint8_sparse v) { + v256uint8_sparse r{}; + if (idx == 0) + r.lo = v; + else + r.hi = v; + return r; +} +INTRINSIC(v128uint16_sparse) +set_v128uint16_sparse(int idx, v64uint16_sparse v) { + v128uint16_sparse r{}; + if (idx == 0) + r.lo = v; + else + r.hi = v; + return r; +} +INTRINSIC(v512int4_sparse) +set_v512int4_sparse(int idx, v256int4_sparse v) { + v512int4_sparse r{}; + if (idx == 0) + r.lo = v; + else + r.hi = v; + return r; +} +INTRINSIC(v256int8_sparse) +set_v256int8_sparse(int idx, v128int8_sparse v) { + v256int8_sparse r{}; + if (idx == 0) + r.lo = v; + else + r.hi = v; + return r; +} +INTRINSIC(v128int16_sparse) +set_v128int16_sparse(int idx, v64int16_sparse v) { + v128int16_sparse r{}; + if (idx == 0) + r.lo = v; + else + r.hi = v; + return r; +} + +// insert / update: return m with half idx (0 = lo, else hi) replaced by v. 
+INTRINSIC(v512uint4_sparse) +insert(v512uint4_sparse m, int idx, v256uint4_sparse v) { + if (idx == 0) + m.lo = v; + else + m.hi = v; + return m; +} +INTRINSIC(v256uint8_sparse) +insert(v256uint8_sparse m, int idx, v128uint8_sparse v) { + if (idx == 0) + m.lo = v; + else + m.hi = v; + return m; +} +INTRINSIC(v128uint16_sparse) +insert(v128uint16_sparse m, int idx, v64uint16_sparse v) { + if (idx == 0) + m.lo = v; + else + m.hi = v; + return m; +} +INTRINSIC(v512int4_sparse) +insert(v512int4_sparse m, int idx, v256int4_sparse v) { + if (idx == 0) + m.lo = v; + else + m.hi = v; + return m; +} +INTRINSIC(v256int8_sparse) +insert(v256int8_sparse m, int idx, v128int8_sparse v) { + if (idx == 0) + m.lo = v; + else + m.hi = v; + return m; +} +INTRINSIC(v128int16_sparse) +insert(v128int16_sparse m, int idx, v64int16_sparse v) { + if (idx == 0) + m.lo = v; + else + m.hi = v; + return m; +} + #endif // __AIE2P_UPD_EXT_H__ diff --git a/clang/lib/Headers/aiebase_typedefs.h b/clang/lib/Headers/aiebase_typedefs.h index 480c453b562d..aac03f93e2f5 100644 --- a/clang/lib/Headers/aiebase_typedefs.h +++ b/clang/lib/Headers/aiebase_typedefs.h @@ -211,6 +211,7 @@ typedef int32_t v4int32 __attribute__((__vector_size__(16))) __attribute__((aligned(__MIN_ALIGNMENT_16))); typedef int16_t v8int16 __attribute__((__vector_size__(16))) __attribute__((aligned(__MIN_ALIGNMENT_16))); +typedef char v16char __attribute__((__vector_size__(16))); typedef int8_t v16int8 __attribute__((__vector_size__(16))) __attribute__((aligned(__MIN_ALIGNMENT_16))); typedef uint32_t v4uint32 __attribute__((__vector_size__(16))) @@ -575,6 +576,53 @@ struct v128bfp16ebs8 { } __attribute__((packed)) __attribute__((return_in_regs)) __attribute__((aligned(8))); +// AIE2P-larger (1280-bit) sparse vector types — Followup H, G-T3.6-003. +// +// These are the AIE-API-style "wide sparse" types: each holds a pair of the +// AIEv2-sized (640-bit) sparse vectors. 
They were previously declared as +// empty-stub structs in aie2p_aie_api_compat.h:53-66 (now removed), which +// blocked any non-trivial implementation of the forward-decls in that file. +// +// Defining them as composite structs (mirroring the v128bfp16ebs16 pattern +// at lines 563-576 above) makes: +// - extract_v128int8_sparse(v256int8_sparse, int) -> field access +// - concat(v128int8_sparse, v128int8_sparse) -> brace-init +// - set_v256int8_sparse / insert(v256int8_sparse, ...) -> field write +// all implementable as pure header functions in aie2p_upd_ext.h. +// +// Bodies live at aie2p_upd_ext.h (tail), in the +// "AIE2P-larger sparse <-> smaller sparse conversion" block. +struct v512uint4_sparse { + v256uint4_sparse lo; + v256uint4_sparse hi; +} __attribute__((packed)) __attribute__((aligned(16))) +__attribute__((return_in_regs)) __attribute__((is_sparse)); +struct v256uint8_sparse { + v128uint8_sparse lo; + v128uint8_sparse hi; +} __attribute__((packed)) __attribute__((aligned(16))) +__attribute__((return_in_regs)) __attribute__((is_sparse)); +struct v128uint16_sparse { + v64uint16_sparse lo; + v64uint16_sparse hi; +} __attribute__((packed)) __attribute__((aligned(16))) +__attribute__((return_in_regs)) __attribute__((is_sparse)); +struct v512int4_sparse { + v256int4_sparse lo; + v256int4_sparse hi; +} __attribute__((packed)) __attribute__((aligned(16))) +__attribute__((return_in_regs)) __attribute__((is_sparse)); +struct v256int8_sparse { + v128int8_sparse lo; + v128int8_sparse hi; +} __attribute__((packed)) __attribute__((aligned(16))) +__attribute__((return_in_regs)) __attribute__((is_sparse)); +struct v128int16_sparse { + v64int16_sparse lo; + v64int16_sparse hi; +} __attribute__((packed)) __attribute__((aligned(16))) +__attribute__((return_in_regs)) __attribute__((is_sparse)); + #endif // __AIEARCH__ == 21 #if __AIEARCH__ == 22 diff --git a/clang/test/CodeGen/aie/aie2p/aie2p-sparse-fifo-ld.cpp b/clang/test/CodeGen/aie/aie2p/aie2p-sparse-fifo-ld.cpp new 
file mode 100644 index 000000000000..411e35758b24 --- /dev/null +++ b/clang/test/CodeGen/aie/aie2p/aie2p-sparse-fifo-ld.cpp @@ -0,0 +1,34 @@ +//===- aie2p-sparse-fifo-ld.cpp -------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// RUN: %clang -O2 %s --target=aie2p -nostdlibinc -S -emit-llvm -o - | FileCheck %s + +// CHECK-LABEL: test_fifo_ld_pop_sparse +// CHECK: call { ptr, <32 x i32>, i32, <64 x i8>, <16 x i8> } @llvm.aie2p.fifo.ld.pop.640.unaligned.sparse.p0.p0( +// CHECK: extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <16 x i8> } {{.*}}, 3 +// CHECK: extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <16 x i8> } {{.*}}, 4 +// CHECK: bitcast <16 x i8> {{.*}} to i128 +// CHECK: insertvalue {{.*}} <64 x i8> {{.*}}, 0 +// CHECK: insertvalue {{.*}} i128 {{.*}}, 1 +v128int8_sparse test_fifo_ld_pop_sparse(v128int8_sparse_unaligned *&p, + fifo_state_t &s) { + return fifo_ld_pop(p, s); +} + +// CHECK-LABEL: test_fifo_ld_pop_sparse_wide +// CHECK: call { ptr, <32 x i32>, i32, <64 x i8>, <16 x i8> } @llvm.aie2p.fifo.ld.pop.640.unaligned.sparse.p0.p0( +// CHECK: call { ptr, <32 x i32>, i32, <64 x i8>, <16 x i8> } @llvm.aie2p.fifo.ld.pop.640.unaligned.sparse.p0.p0( +// CHECK: insertvalue {{.*}} <64 x i8> {{.*}}, 0, 0 +// CHECK: insertvalue {{.*}} i128 {{.*}}, 0, 1 +// CHECK: insertvalue {{.*}} <64 x i8> {{.*}}, 1, 0 +// CHECK: insertvalue {{.*}} i128 {{.*}}, 1, 1 +v256int8_sparse test_fifo_ld_pop_sparse_wide(v256int8_sparse_unaligned *&p, + fifo_state_t &s) { + return fifo_ld_pop(p, s); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAIE2P.td b/llvm/include/llvm/IR/IntrinsicsAIE2P.td index dae8e0fd1492..9d1dc35d579d 100644 
--- a/llvm/include/llvm/IR/IntrinsicsAIE2P.td +++ b/llvm/include/llvm/IR/IntrinsicsAIE2P.td @@ -694,9 +694,17 @@ class AIE2PFIFO_LD_POPUnaligned : DefaultAttrsIntrinsic<[llvm_v64i8_ty, llvm_any class AIE2PFIFO_LD_POP_576_BFP16 : DefaultAttrsIntrinsic<[llvm_anyptr_ty, llvm_v32i32_ty, llvm_i32_ty, llvm_v64i8_ty, llvm_v8i8_ty], [llvm_anyptr_ty, llvm_v32i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; +// POP 640-bit sparse (v128int8_sparse-class). Returns ptr/fifo/pos updates +// plus the 512-bit data half and the 128-bit sparsity-mask half. +// Mirrors the BFP16 multi-output shape: data goes into a v64i8 (=v64char) +// and the mask goes into a v16i8 (=v16char, 128 bits = sparsity_t storage). +class AIE2PFIFO_LD_POP_640_Sparse : DefaultAttrsIntrinsic<[llvm_anyptr_ty, llvm_v32i32_ty, llvm_i32_ty, llvm_v64i8_ty, llvm_v16i8_ty], + [llvm_anyptr_ty, llvm_v32i32_ty, llvm_i32_ty], + [IntrReadMem, IntrArgMemOnly]>; def int_aie2p_fifo_ld_pop_unaligned: AIE2PFIFO_LD_POPUnaligned; def int_aie2p_fifo_ld_pop_576_bfp16: AIE2PFIFO_LD_POP_576_BFP16; def int_aie2p_fifo_ld_pop_544_bfp16: AIE2PFIFO_LD_POP_576_BFP16; +def int_aie2p_fifo_ld_pop_640_unaligned_sparse: AIE2PFIFO_LD_POP_640_Sparse; // POP 1D class AIE2PFIFO_LD_POPUnaligned1D : DefaultAttrsIntrinsic<[llvm_v64i8_ty, llvm_anyptr_ty, llvm_v32i32_ty, llvm_i32_ty], diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 3ed07cbc5ce9..4361d7deeb4e 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6786,6 +6786,8 @@ static const Value *getUnderlyingObjectAIEIntrinsic(const Value *V) { case Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16: case Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16: case Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: + // fifo ld pop 640 sparse + case Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse: // fifo st flush case Intrinsic::aie2p_fifo_st_flush: case Intrinsic::aie2p_fifo_st_flush_1d: diff --git 
a/llvm/lib/Target/AIE/aie2p/AIE2PISelLowering.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PISelLowering.cpp index 87185c45e6cf..804eae33537f 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PISelLowering.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PISelLowering.cpp @@ -119,6 +119,7 @@ bool AIE2PTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16: case Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: case Intrinsic::aie2p_fifo_ld_pop_576_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse: // The HW does a 512-bit load from somewhere between addr-63 and addr+128 // depending on the FIFO availability and the input alignment. // A conservative access range would be [addr-64, addr+192) diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp index 950b4b15258b..4f89f50ba0c8 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp @@ -79,6 +79,10 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { Register MantissaDst, Register ExponentDst, MachineRegisterInfo &MRI); + bool buildAndConstrainSparseFifoLoadCopies(Register SparseVecDst, + Register DataDst, + Register MaskDst, + MachineRegisterInfo &MRI); Register createDSRegSequence(Register ModifierReg, Register Incr1Reg, Register Incr2Reg, Register Size1Reg, Register Count1Reg, Register Size2Reg, @@ -104,6 +108,8 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { bool selectVLD_FIFO_POP_BFP16_1D(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVLD_FIFO_POP_BFP16_2D(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVLD_FIFO_POP_BFP16_3D(MachineInstr &I, MachineRegisterInfo &MRI); + bool selectVLD_FIFO_POP_640_SPARSE(MachineInstr &I, + MachineRegisterInfo &MRI); bool selectVLD_FIFO_FILLX(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVLD_FIFO_POPX(MachineInstr &I, MachineRegisterInfo 
&MRI); @@ -422,6 +428,8 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) { case Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16: case Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: return selectVLD_FIFO_POP_BFP16_3D(I, MRI); + case Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse: + return selectVLD_FIFO_POP_640_SPARSE(I, MRI); case Intrinsic::set_loop_iterations: return selectSetLoopIterations(I, MRI, MIB); case Intrinsic::start_loop_iterations: @@ -784,6 +792,26 @@ bool AIE2PInstructionSelector::buildAndConstrainFifoLoadCopies( CopyMI1->getOperand(0)); } +// Split the 640-bit sparse-load destination into the 512-bit data half +// (sub_sparse_x) and the 128-bit sparsity-mask half (sub_sparse_q). +// Mirrors buildAndConstrainFifoLoadCopies (BFP16 path). +bool AIE2PInstructionSelector::buildAndConstrainSparseFifoLoadCopies( + Register SparseVecDest, Register DataDst, Register MaskDst, + MachineRegisterInfo &MRI) { + + auto CopyDataMI = MIB.buildInstr(TargetOpcode::COPY, {DataDst}, {}) + .addReg(SparseVecDest, 0, AIE2P::sub_sparse_x); + auto CopyMaskMI = MIB.buildInstr(TargetOpcode::COPY, {MaskDst}, {}) + .addReg(SparseVecDest, 0, AIE2P::sub_sparse_q); + + return constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *CopyDataMI, + AIE2P::VEC512RegClass, + CopyDataMI->getOperand(0)) && + constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *CopyMaskMI, + AIE2P::VEC128RegClass, + CopyMaskMI->getOperand(0)); +} + Register AIE2PInstructionSelector::createDSRegSequence( Register ModifierReg, Register Incr1Reg, Register Incr2Reg, Register Size1Reg, Register Count1Reg, Register Size2Reg, @@ -1879,6 +1907,8 @@ unsigned getLoadFifoOpcode(MachineInstr &I) { return AIE2P::VLDB_FILLX_512; case Intrinsic::aie2p_fifo_ld_popx: return AIE2P::VLDB_POPX_512; + case Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse: + return AIE2P::VLD_POP_640_normal_pop_pseudo; } llvm_unreachable("unreachable: Failed to get sparse load opcode"); return AIE2P::INSTRUCTION_LIST_END; @@ -2081,6 +2111,36 @@ bool 
AIE2PInstructionSelector::selectVLD_FIFO_POP_BFP16( CopiesConstrained; } +// Lower the sparse-load intrinsic to the existing 640-bit silicon pseudo +// VLD_POP_640_normal_pop_pseudo. The pseudo destination is mQXsa (640 bits), +// which we then split via sub_sparse_x / sub_sparse_q copies into the +// data + mask outputs the builtin exposes by reference. +bool AIE2PInstructionSelector::selectVLD_FIFO_POP_640_SPARSE( + MachineInstr &I, MachineRegisterInfo &MRI) { + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); + assert(IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse); + // GIntrinsic returns are: ptr, fifo, pos, data, mask (5 defs). + Register PtrOut = I.getOperand(0).getReg(); + Register FifoOut = I.getOperand(1).getReg(); + Register AvailOut = I.getOperand(2).getReg(); + Register DataOut = I.getOperand(3).getReg(); + Register MaskOut = I.getOperand(4).getReg(); + // Operand 5 is the intrinsic-id; then come the ptr, fifo, pos inputs. + Register PtrIn = I.getOperand(6).getReg(); + Register FifoIn = I.getOperand(7).getReg(); + Register AvailIn = I.getOperand(8).getReg(); + Register Vec640Out = MRI.createVirtualRegister(&AIE2P::mQXsaRegClass); + MachineInstrBuilder MI = MIB.buildInstr( + getLoadFifoOpcode(I), {Vec640Out, PtrOut, FifoOut, AvailOut}, + {PtrIn, FifoIn, AvailIn}); + bool CopiesConstrained = + buildAndConstrainSparseFifoLoadCopies(Vec640Out, DataOut, MaskOut, MRI); + MI.cloneMemRefs(I); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI) && + CopiesConstrained; +} + bool AIE2PInstructionSelector::selectVLD_FIFO_POP_BFP16_1D( MachineInstr &I, MachineRegisterInfo &MRI) { unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PRegisterBankInfo.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PRegisterBankInfo.cpp index 6f87607692aa..9054a2a599b7 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PRegisterBankInfo.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PRegisterBankInfo.cpp @@ -582,6 +582,12 @@ static bool 
isUsedAsFifoRegInIntrinsic(const MachineRegisterInfo &MRI, return checkFifoDstSrc(MI, FifoRegCandidate, 1, 7); break; } + case Intrinsic::aie2p_fifo_ld_pop_640_unaligned_sparse: { + // Same shape as 544/576 BFP16: 5 outputs (ptr, fifo, pos, data, mask), + // intrinsic-id at 5, then ptr=6, fifo=7, pos=8. + return checkFifoDstSrc(MI, FifoRegCandidate, 1, 7); + break; + } case Intrinsic::aie2p_fifo_ld_pop_2d_unaligned: { return checkFifoDstSrc(MI, FifoRegCandidate, 2, 7); break; diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir index b32257449fb2..f9cbdd4ac848 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir @@ -157,6 +157,31 @@ body: | PseudoRET implicit $lr, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13 ... +--- +name: pop_640_sparse +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: + ; CHECK-LABEL: name: pop_640_sparse + ; CHECK: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[VLD_POP_640_normal_pop_pseudo:%[0-9]+]]:mqxsa, [[VLD_POP_640_normal_pop_pseudo1:%[0-9]+]]:eps, [[VLD_POP_640_normal_pop_pseudo2:%[0-9]+]]:eldfiforeg, [[VLD_POP_640_normal_pop_pseudo3:%[0-9]+]]:erf2 = VLD_POP_640_normal_pop_pseudo [[DEF]], [[DEF1]], [[DEF2]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLD_POP_640_normal_pop_pseudo]].sub_sparse_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec128 = COPY [[VLD_POP_640_normal_pop_pseudo]].sub_sparse_q + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLD_POP_640_normal_pop_pseudo1]], implicit [[VLD_POP_640_normal_pop_pseudo2]], implicit [[VLD_POP_640_normal_pop_pseudo3]], implicit [[COPY]], implicit [[COPY1]] + %14:vregbank(<64 x s8>) = 
G_IMPLICIT_DEF + %15:vregbank(<16 x s8>) = G_IMPLICIT_DEF + %6:ptrregbank(p0) = G_IMPLICIT_DEF + %7:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %8:gprregbank(s32) = G_IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32), %12:vregbank(<64 x s8>), %13:vregbank(<16 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.640.unaligned.sparse), %6:ptrregbank(p0), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32), %14:vregbank(<64 x s8>), %15:vregbank(<16 x s8>) + PseudoRET implicit $lr, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13 +... + --- name: pop_unaligned_1d tracksRegLiveness: true