Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,740 changes: 1,875 additions & 1,865 deletions llvm/lib/Target/AIE/aie2p/AIE2PGenSchedule.td

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AIE/GlobalISel/legalize-dyn-stackalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ define void @test_simple_dyn_alloca(i32 noundef %n) {
; AIE2P-NEXT: mova r1, #2; nopb ; nops ; nopxm ; nopv
; AIE2P-NEXT: paddxm [sp], #64
; AIE2P-NEXT: lshl r0, r0, r1
; AIE2P-NEXT: st lr, [sp, #-60] // 4-byte Folded Spill
; AIE2P-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill
; AIE2P-NEXT: mov p7, sp
; AIE2P-NEXT: mov p1, sp
; AIE2P-NEXT: mova r1, #-64
; AIE2P-NEXT: st lr, [sp, #-60] // 4-byte Folded Spill
; AIE2P-NEXT: add r0, r0, #63
; AIE2P-NEXT: mov p0, p1
; AIE2P-NEXT: jl #extern_call
Expand Down Expand Up @@ -148,22 +148,22 @@ define void @test_loop_dyn_alloca(i32 noundef %n) {
; AIE2P-LABEL: test_loop_dyn_alloca:
; AIE2P: // %bb.0: // %entry
; AIE2P-NEXT: nopa ; nopb ; paddxm [sp], #64
; AIE2P-NEXT: st lr, [sp, #-32] // 4-byte Folded Spill
; AIE2P-NEXT: st r8, [sp, #-36] // 4-byte Folded Spill
; AIE2P-NEXT: st r9, [sp, #-40] // 4-byte Folded Spill
; AIE2P-NEXT: st r10, [sp, #-44] // 4-byte Folded Spill
; AIE2P-NEXT: st r11, [sp, #-48] // 4-byte Folded Spill
; AIE2P-NEXT: st r12, [sp, #-52] // 4-byte Folded Spill
; AIE2P-NEXT: st r13, [sp, #-56] // 4-byte Folded Spill
; AIE2P-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill
; AIE2P-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill
; AIE2P-NEXT: mov p7, sp
; AIE2P-NEXT: st r8, [sp, #-36] // 4-byte Folded Spill
; AIE2P-NEXT: mova r8, #1
; AIE2P-NEXT: st r9, [sp, #-40] // 4-byte Folded Spill
; AIE2P-NEXT: mova r9, #0
; AIE2P-NEXT: st r10, [sp, #-44] // 4-byte Folded Spill
; AIE2P-NEXT: mova r10, #10
; AIE2P-NEXT: st r11, [sp, #-48] // 4-byte Folded Spill
; AIE2P-NEXT: mova r11, #2
; AIE2P-NEXT: st r12, [sp, #-52] // 4-byte Folded Spill
; AIE2P-NEXT: mova r12, #-64
; AIE2P-NEXT: st r13, [sp, #-56] // 4-byte Folded Spill
; AIE2P-NEXT: mova r13, #0
; AIE2P-NEXT: st lr, [sp, #-32] // 4-byte Folded Spill
; AIE2P-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill
; AIE2P-NEXT: padda [p7], #-64
; AIE2P-NEXT: .LBB1_1: // %for.body
; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1
Expand Down Expand Up @@ -321,9 +321,9 @@ define void @test_huge_stack(i32 noundef %n) #0 {
; AIE2P-NEXT: mov p6, p7
; AIE2P-NEXT: padda [p0], m0
; AIE2P-NEXT: mova m0, #-32
; AIE2P-NEXT: st r0, [p0, #0]
; AIE2P-NEXT: padda [p3], m0
; AIE2P-NEXT: mova m0, #-24
; AIE2P-NEXT: st r0, [p0, #0]
; AIE2P-NEXT: lda r0, [p0, #0]
; AIE2P-NEXT: mov p0, sp
; AIE2P-NEXT: mov r8, p3
Expand Down
76 changes: 36 additions & 40 deletions llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,77 +17,73 @@
define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 dereferenceable(64) %params) {
; CHECK-LABEL: gelu_fn:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; nopxm
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopxm
; CHECK-NEXT: movxm r0, #16544
; CHECK-NEXT: vbcst.16 x6, r0
; CHECK-NEXT: lda r1, [p2, #0]; movxm r0, #17280
; CHECK-NEXT: mova r0, #60; vbcst.16 x2, r0
; CHECK-NEXT: vadd.f dm3, dm1, dm0, r0
; CHECK-NEXT: vconv.fp32.bf16 cml0, x6
; CHECK-NEXT: nop
; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64
; CHECK-NEXT: movxm r2, #15821
; CHECK-NEXT: mova r2, #255; movx r4, #1; vbcst.16 x4, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x8, cml3; lshl r2, r1, r4; vbcst.16 x0, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; movxm r2, #15821
; CHECK-NEXT: movx r4, #1
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; movx r2, #255; vbcst.16 x4, r2
; CHECK-NEXT: vconv.bf16.fp32 x8, cml3; lshl r2, r1, r4; vbcst.16 x0, r2
; CHECK-NEXT: mova r2, #828; mov m0, r2; vadd.f dm3, dm2, dm0, r0
; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm2, x8, x2, r2
; CHECK-NEXT: nop
; CHECK-NEXT: vadd.f dm3, dm1, dm0, r0
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vadd.f dm3, dm2, dm0, r0
; CHECK-NEXT: vconv.bf16.fp32 x10, cml3
; CHECK-NEXT: vconv.bf16.fp32 x8, cml2
; CHECK-NEXT: vmul.f dm1, x10, x2, r2
; CHECK-NEXT: vconv.bf16.fp32 x1, cml3
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vmul.f dm4, x8, x4, r2
; CHECK-NEXT: vconv.bf16.fp32 x7, cml3; vmul.f dm2, x1, x2, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x1, cml3; vmul.f dm1, x10, x2, r2
; CHECK-NEXT: nop
; CHECK-NEXT: vmul.f dm2, x1, x2, r2
; CHECK-NEXT: vconv.bf16.fp32 x7, cml3; vmul.f dm4, x8, x4, r2
; CHECK-NEXT: vadd.f dm1, dm1, dm0, r0
; CHECK-NEXT: vmul.f dm3, x7, x2, r2
; CHECK-NEXT: vconv.bf16.fp32 x10, cml1; vadd.f dm1, dm1, dm0, r0
; CHECK-NEXT: nop
; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.bf16.fp32 x8, cml4; movx r3, #0; vmul.f dm4, x10, x4, r2
; CHECK-NEXT: vconv.bf16.fp32 x5, cml2; mov s0, r3
; CHECK-NEXT: vfloor.s32.bf16 x1, wl8, s0
; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; vmul.f dm4, x5, x4, r2
; CHECK-NEXT: vconv.bf16.fp32 x7, cml1; movxm ls, #.LBB0_1; vadd.f dm2, dm2, dm0, r0
; CHECK-NEXT: mova r4, #-5; nopb ; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0; vmul.f dm3, x5, x4, r2
; CHECK-NEXT: mova r1, #2; nopb ; vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vbcst.16 x6, r3; vmul.f dm4, x7, x2, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vshuffle x1, x1, x3, r1
; CHECK-NEXT: vfloor.s32.bf16 x9, wl10, s0; vmin_ge.16 x3, r16, x1, x0, vaddsign1
; CHECK-NEXT: vfloor.s32.bf16 x3, wh10, s0; add.nc lc, r4, #-7
; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml4; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; nopv
; CHECK-NEXT: padda [p1], m0; nopb ; nops ; nopxm ; nopv
; CHECK-NEXT: vconv.bf16.fp32 x10, cml1
; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64
; CHECK-NEXT: vconv.bf16.fp32 x5, cml2
; CHECK-NEXT: vconv.bf16.fp32 x8, cml4; vmul.f dm4, x10, x4, r2
; CHECK-NEXT: vconv.bf16.fp32 x7, cml1; vmul.f dm4, x5, x4, r2
; CHECK-NEXT: mova r3, #0; vconv.bf16.fp32 x5, cml3; vadd.f dm2, dm2, dm0, r0
; CHECK-NEXT: mov s0, r3; vmul.f dm3, x7, x2, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vfloor.s32.bf16 x1, wl8, s0; movxm ls, #.LBB0_1; vmul.f dm4, x5, x4, r2
; CHECK-NEXT: mova r4, #-5; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0
; CHECK-NEXT: vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vbcst.16 x6, r3
; CHECK-NEXT: mova r1, #2; add.nc lc, r4, #-7
; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x1, wh10, s0; nopx ; vshuffle x3, x1, x3, r1; nopv
; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x9, cml4; nopx ; vmin_ge.16 x3, r16, x3, x0, vaddsign1; nopv
; CHECK-NEXT: padda [p1], m0; nopb ; vfloor.s32.bf16 x5, wl10, s0; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; nopv
; CHECK-NEXT: .LBB0_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x10, cml2; nopxm ; nopv
; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vadd.f dm2, dm4, dm0, r0
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x7, cml4; nopx ; vmov cml4, cml1; vmul.f dm4, x10, x2, r2
; CHECK-NEXT: nopa ; nopb ; vst x11, [p1], #64; nopx ; vshuffle x1, x9, x3, r1; nopv
; CHECK-NEXT: vfloor.s32.bf16 x3, wh8, s0; vmin_ge.16 x5, r16, x1, x0, vaddsign1
; CHECK-NEXT: vfloor.s32.bf16 x9, wl8, s0; vmax_lt.16 x11, r16, x5, x6, vaddsign1
; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x8, cml2; nopxm ; vadd.f dm2, dm2, dm0, r0
; CHECK-NEXT: nopa ; nopb ; vst x11, [p1], #64; nopx ; vmov cml2, cml1; nopv
; CHECK-NEXT: nopa ; nopb ; nopx ; vshuffle x10, x5, x1, r1; vfloor.s32.bf16 x1, wh9, s0
; CHECK-NEXT: vconv.bf16.fp32 x3, cml3; vmin_ge.16 x7, r16, x10, x0, vaddsign1; vmul.f dm3, x8, x2, r2
; CHECK-NEXT: vfloor.s32.bf16 x5, wl9, s0; vmax_lt.16 x11, r16, x7, x6, vaddsign1
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml3; nopxm ; vmul.f dm3, x7, x4, r2
; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x9, cml4; nopxm ; vmul.f dm4, x3, x4, r2
; CHECK-NEXT: // %bb.2:
; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vshuffle x10, x9, x3, r1; nopv
; CHECK-NEXT: vmin_ge.16 x10, r16, x10, x0, vaddsign1
; CHECK-NEXT: vmax_lt.16 x10, r16, x10, x6, vaddsign1
; CHECK-NEXT: vst x11, [p1], #64
; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vshuffle x10, x5, x1, r1; nopv
; CHECK-NEXT: vst x11, [p1], #64; nopx ; vmin_ge.16 x10, r16, x10, x0, vaddsign1
; CHECK-NEXT: vfloor.s32.bf16 x8, wh9, s0; vmax_lt.16 x10, r16, x10, x6, vaddsign1
; CHECK-NEXT: vfloor.s32.bf16 x10, wl9, s0
; CHECK-NEXT: vst x10, [p1], #64
; CHECK-NEXT: vfloor.s32.bf16 x10, wl8, s0
; CHECK-NEXT: vfloor.s32.bf16 x8, wh8, s0
; CHECK-NEXT: nop
; CHECK-NEXT: vshuffle x8, x10, x8, r1
; CHECK-NEXT: vmin_ge.16 x8, r16, x8, x0, vaddsign1
; CHECK-NEXT: vmax_lt.16 x8, r16, x8, x6, vaddsign1
; CHECK-NEXT: vconv.bf16.fp32 x8, cml3
; CHECK-NEXT: vconv.bf16.fp32 x8, cml4
; CHECK-NEXT: vst x8, [p1], #64
; CHECK-NEXT: vfloor.s32.bf16 x10, wl8, s0
; CHECK-NEXT: vfloor.s32.bf16 x8, wh8, s0
; CHECK-NEXT: nop
; CHECK-NEXT: vshuffle x8, x10, x8, r1
; CHECK-NEXT: vmin_ge.16 x8, r16, x8, x0, vaddsign1
; CHECK-NEXT: vmax_lt.16 x8, r16, x8, x6, vaddsign1
; CHECK-NEXT: vconv.bf16.fp32 x8, cml4
; CHECK-NEXT: vconv.bf16.fp32 x8, cml3
; CHECK-NEXT: vst x8, [p1], #64
; CHECK-NEXT: vmul.f dm3, x8, x4, r2
; CHECK-NEXT: nop
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AIE/aie2p/end-to-end/gemm-bfp16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@ declare <64 x i32> @llvm.aie2p.BFP576.BFP576.ACC2048.mac.conf(<64 x i8>, <8 x i8
define dso_local void @gemm_bfp16(ptr %ofm_ptr, ptr %ifm_ptr, ptr %wts_ptr, ptr %param, i20 %0, i20 %1, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %idx.ext.i.i.i.i.i.i.i.i.i90.i) {
; CHECK-LABEL: gemm_bfp16:
; CHECK: // %bb.0: // %newFuncRoot
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r7, p5; nops
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r7, p5
; CHECK-NEXT: paddxm [sp], #64
; CHECK-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill
; CHECK-NEXT: mova m0, #-68; mov p6, sp
; CHECK-NEXT: padda [p6], m0; mov dc5, #0
; CHECK-NEXT: lda dn2, [p6], #-4; movs m2, p4; movxm r0, #16256
; CHECK-NEXT: padda [p6], m0; movxm r0, #16256
; CHECK-NEXT: lda dn2, [p6], #-4; movs m2, p4; mov dc5, #0
; CHECK-NEXT: lda m0, [p6], #-4; movs dj2, p5; mov r6, p4
; CHECK-NEXT: mova dj6, #68; movs p5, p0; vbcst.16 x0, r0
; CHECK-NEXT: lda dj0, [p6], #-4; movs dc1, dc5; movx r1, #53; mov dc2, dc5
; CHECK-NEXT: lda dj4, [p6], #-4; movs dc3, dc5; movx r2, #60; mov dc4, dc5
; CHECK-NEXT: lda dn0, [p6], #-4; movs dc0, dc5; movx r3, #780; mov dj3, r7
; CHECK-NEXT: lda m4, [p6, #-4]; st p7, [sp, #-64]; movx r4, #0; vmov x1, x0 // 4-byte Folded Spill
; CHECK-NEXT: mova dj6, #68; movs p5, p0; mov dc1, dc5
; CHECK-NEXT: lda dj0, [p6], #-4; movs dc2, dc5; movx r1, #53; mov dc3, dc5
; CHECK-NEXT: lda dj4, [p6], #-4; st p7, [sp, #-64]; movx r2, #60; vbcst.16 x0, r0 // 4-byte Folded Spill
; CHECK-NEXT: lda dn0, [p6], #-4; movs dc4, dc5; movx r3, #780; mov dc0, dc5
; CHECK-NEXT: lda m4, [p6, #-4]; movs dj3, r7; movx r4, #0; vmov x1, x0
; CHECK-NEXT: lda dn4, [p6, #0]; movs p6, p1; movx r0, #52; mov dn3, dn2
; CHECK-NEXT: .LBB0_1: // %for.body.i
; CHECK-NEXT: // =>This Loop Header: Depth=1
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AIE/aie2p/extractelement.ll
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ define i64 @extract_v16i64_dyn(<16 x i64> inreg %v, i32 %idx) nounwind {
; AIE2P-NEXT: vmov x0, bmlh0
; AIE2P-NEXT: vmov x2, bmll0
; AIE2P-NEXT: lt r27, r2, r0
; AIE2P-NEXT: add r16, r27, #-1
; AIE2P-NEXT: sel.nez r0, r1, r0, r27
; AIE2P-NEXT: ret lr
; AIE2P-NEXT: sel.nez r0, r1, r0, r27 // Delay Slot 5
; AIE2P-NEXT: sub r0, r2, r0 // Delay Slot 4
; AIE2P-NEXT: vsel.32 x0, x2, x0, r16 // Delay Slot 3
; AIE2P-NEXT: add r16, r27, #-1 // Delay Slot 5
; AIE2P-NEXT: vsel.32 x0, x2, x0, r16 // Delay Slot 4
; AIE2P-NEXT: sub r0, r2, r0 // Delay Slot 3
; AIE2P-NEXT: vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2
; AIE2P-NEXT: nop // Delay Slot 1
%1 = extractelement <16 x i64> %v, i32 %idx
Expand Down Expand Up @@ -127,12 +127,12 @@ define i32 @extract_v64i32_dyn(<64 x i32> inreg %v, i32 %idx) nounwind {
; AIE2P-NEXT: sel.nez r0, r2, r0, r27
; AIE2P-NEXT: lt r27, r1, r2
; AIE2P-NEXT: vsel.32 x0, x6, x0, r16
; AIE2P-NEXT: add r18, r27, #-1
; AIE2P-NEXT: sel.nez r0, r3, r0, r27
; AIE2P-NEXT: add r18, r27, #-1
; AIE2P-NEXT: ret lr
; AIE2P-NEXT: vsel.32 x0, x2, x0, r17 // Delay Slot 5
; AIE2P-NEXT: sub r0, r1, r0 // Delay Slot 4
; AIE2P-NEXT: vsel.32 x0, x4, x0, r18 // Delay Slot 3
; AIE2P-NEXT: vsel.32 x0, x4, x0, r18 // Delay Slot 4
; AIE2P-NEXT: sub r0, r1, r0 // Delay Slot 3
; AIE2P-NEXT: vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2
; AIE2P-NEXT: nop // Delay Slot 1
%1 = extractelement <64 x i32> %v, i32 %idx
Expand Down Expand Up @@ -171,12 +171,12 @@ define i64 @extract_v32i64_dyn(<32 x i64> inreg %v, i32 %idx) nounwind {
; AIE2P-NEXT: sel.nez r0, r1, r0, r27
; AIE2P-NEXT: lt r27, r2, r1
; AIE2P-NEXT: vsel.32 x0, x6, x0, r16
; AIE2P-NEXT: add r18, r27, #-1
; AIE2P-NEXT: sel.nez r0, r3, r0, r27
; AIE2P-NEXT: add r18, r27, #-1
; AIE2P-NEXT: ret lr
; AIE2P-NEXT: vsel.32 x0, x2, x0, r17 // Delay Slot 5
; AIE2P-NEXT: sub r0, r2, r0 // Delay Slot 4
; AIE2P-NEXT: vsel.32 x0, x4, x0, r18 // Delay Slot 3
; AIE2P-NEXT: vsel.32 x0, x4, x0, r18 // Delay Slot 4
; AIE2P-NEXT: sub r0, r2, r0 // Delay Slot 3
; AIE2P-NEXT: vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2
; AIE2P-NEXT: nop // Delay Slot 1
%1 = extractelement <32 x i64> %v, i32 %idx
Expand Down
Loading