diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index 59f692d02eed..608b0ac0d5de 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -406,6 +406,10 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { llvm_unreachable("Target didn't implement getNumReservedDelaySlots"); } + /// Convert an InstrStage's getUnits() value to an FU bit position. + /// Default: identity, matching AIE's FUNCUNIT_REPRESENTATION(x) = (x). + virtual unsigned getFuncUnitIndex(uint64_t Units) const { return Units; } + /// Check whether Opc represents a JNZ instruction. This is mainly for /// detecting a downcounting loop branch. virtual bool isJNZ(unsigned Opc) const { return false; } diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index c97d19c230a4..c210c13f2010 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -618,7 +618,14 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { if (BS.getRegions().size() == 1) { auto &PostSWP = BS.getPostSWP(); if (PostSWP.isPostPipelineCandidate(*BS.TheBlock)) { - BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock); + // A CLI --aie-postpipeliner-target-ii is a hard limit: start at + // exactly that II (bypassing --aie-postpipeliner-maxii) and let + // updatePipelining one-shot it. A pragma-driven TargetII is a soft + // hint: start at ResMII and iterate normally; the solver fallback at + // II == TargetII is handled inside the post-pipeliner. + BS.FixPoint.II = PostSWP.isTargetIIHardLimit() + ? PostSWP.getTargetII() + : PostSWP.getResMII(*BS.TheBlock); BS.FixPoint.IITries = 1; return SchedulingStage::Pipelining; } @@ -632,11 +639,17 @@ SchedulingStage InterBlockScheduling::updatePipelining(BlockState &BS) { return BS.FixPoint.Stage; } - // Otherwise try a larger II. 
- // We cut off at larger IIs to prevent excessive compilation time. - if (++BS.FixPoint.II <= PostPipelinerMaxII && - ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) { - return SchedulingStage::Pipelining; + // A CLI --aie-postpipeliner-target-ii is one-shot: try only the requested + // II, even if it exceeds --aie-postpipeliner-maxii. If that attempt + // failed, do not try any other II. A pragma-driven TargetII keeps the + // normal iteration (ResMII..MaxII). + if (!BS.getPostSWP().isTargetIIHardLimit()) { + // Otherwise try a larger II. + // We cut off at larger IIs to prevent excessive compilation time. + if (++BS.FixPoint.II <= PostPipelinerMaxII && + ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) { + return SchedulingStage::Pipelining; + } } auto *BB = BS.TheBlock; diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 2d9dee7ab056..807d411ebcbe 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/ResourceScoreboard.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include #include @@ -44,9 +45,16 @@ static cl::opt cl::desc("Number of runs for heuristics that converge"), cl::init(20), cl::Hidden); -static cl::opt PresetII("aie-postpipeliner-target-ii", - cl::desc("II for which to allow the solver"), - cl::init(0), cl::Hidden); +static cl::opt + UseSolver("aie-postpipeliner-solver", + cl::desc("Use the solver as fallback after heuristics fail"), + cl::init(false), cl::Hidden); + +static cl::opt + PresetII("aie-postpipeliner-target-ii", + cl::desc("Run solver-only at this II; bypasses MaxII and " + "skips heuristics"), + cl::init(0), cl::Hidden); PipelineScheduleVisitor::~PipelineScheduleVisitor() {} @@ -160,16 +168,35 @@ bool PostPipeliner::isPostPipelineCandidate(MachineBasicBlock &LoopBlock) { return 
false; } - if (PresetII) { - TargetII = PresetII; + // No solver backend compiled in: TargetII/--aie-postpipeliner-solver + // are no-ops. Keep pre-commit behavior (heuristics only). + if (!Solver::hasSolver()) { + const bool AnyRequest = + PresetII || UseSolver || getInitiationInterval(getLoopID(LoopBlock)); + if (AnyRequest) { + DEBUG_SUMMARY( + dbgs() << " PostPipeliner: ignoring TargetII/solver request, " + "no solver compiled in\n"); + } return true; } - auto ParsedInitiationInterval = getInitiationInterval(getLoopID(LoopBlock)); - if (ParsedInitiationInterval) { - TargetII = *ParsedInitiationInterval; - DEBUG_SUMMARY(dbgs() << " PostPipeliner: TargetII=" << TargetII << "\n"); + + // --aie-postpipeliner-target-ii: hard one-shot. Bypasses MaxII and + // skips heuristics; only the solver runs at exactly this II. + if (PresetII) { + TargetII = PresetII; + TargetIIIsHardLimit = true; + } else if (!UseSolver) { + // Pragma soft hint: heuristics iterate normally and the solver runs + // at II == TargetII. --aie-postpipeliner-solver overrides this. + if (const auto Pragma = getInitiationInterval(getLoopID(LoopBlock))) + TargetII = *Pragma; } + if (TargetII) + DEBUG_SUMMARY(dbgs() << " PostPipeliner: TargetII=" << TargetII + << (TargetIIIsHardLimit ? " (hard)" : " (soft)") + << "\n"); return true; } @@ -1431,8 +1458,7 @@ static const ConfigStrategy::Configuration Heuristics[] = { {1, false, false, 1, {Prio::NodeNum}, {}}, // pure bottom up }; -bool PostPipeliner::tryApproaches() { - DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); +bool PostPipeliner::runHeuristics() { int HeuristicIndex = 0; for (const auto &Config : Heuristics) { if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { @@ -1459,27 +1485,45 @@ bool PostPipeliner::tryApproaches() { } DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " failed\n"); } + // Last-chance heuristic: relax the iteration-count constraint. 
IterCountSlackStrategy Relaxed(*DAG, Info, MinLength + II); resetSchedule(/*FullReset=*/true); - if (scheduleWithStrategy(Relaxed)) { + return scheduleWithStrategy(Relaxed); +} + +bool PostPipeliner::runSolverFallback() { + const SolverData Data = createSolverData(); + const int NS = MinLength / II; + if (solve(Data, NS, false)) { return true; } - - // TargetII is the OK from the user to spend some time reaching this II. - // Therefore, if we haven't found a solution yet, bring in the big guns. - if (II == TargetII) { - const SolverData Data = createSolverData(); - int NS = MinLength / II; - if (solve(Data, NS, false)) { - return true; - } - if (NS == MinTripCount) { - // Only try this at the boundary case - if (solve(Data, NS + 1, true)) { - return true; - } - } + // Let's try SEF solution. + if (solve(Data, NS + 1, true)) { + return true; } + // Marsshot: last try with full NS + 1. + return solve(Data, NS + 1, false); +} + +bool PostPipeliner::tryApproaches() { + DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); + + // CLI --aie-postpipeliner-target-ii: solver-only, skip heuristics. + const bool SolverOnly = TargetIIIsHardLimit; + const bool RunHeuristics = !SolverOnly; + + // Solver runs at this II if the user asked for solver fallback at every + // II, or this II matches a TargetII (CLI hard or pragma soft hint). + const bool SolverAtThisII = + UseSolver || SolverOnly || (TargetII != 0 && II == TargetII); + // Belt-and-braces re-check: never call solve() with no backend, even + // though isPostPipelineCandidate already filtered the request out. 
+ const bool RunSolver = Solver::hasSolver() && SolverAtThisII; + + if (RunHeuristics && runHeuristics()) + return true; + if (RunSolver && runSolverFallback()) + return true; DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); return false; @@ -1495,13 +1539,41 @@ bool PostPipeliner::solve(const SolverData &Data, int NS, bool SEFStage) { return false; } +// Register \p MI's per-cycle FU footprint into \p Data so +// SWPSolver::resourceConflicts can forbid same-cycle co-occupation. +static void addResourceUses(SolverData &Data, int Id, const MachineInstr *MI, + const AIEBaseInstrInfo *TII) { + const InstrItineraryData *Itin = + MI->getMF()->getSubtarget().getInstrItineraryData(); + if (!Itin || Itin->isEmpty()) { + return; + } + const unsigned SchedClass = MI->getDesc().getSchedClass(); + int Cycle = 0; + for (const InstrStage &IS : Itin->getStages(SchedClass)) { + const auto Kind = IS.getReservationKind(); + const bool IsRequired = Kind == InstrStage::Required; + assert((IsRequired || Kind == InstrStage::Reserved) && + "ReservationKind must be exactly one of Required/Reserved"); + + assert(IS.getNextCycles() >= 0 && + "Negative NextCycles breaks cumulative offset"); + const unsigned FUIndex = TII->getFuncUnitIndex(IS.getUnits()); + for (unsigned C = 0; C < IS.getCycles(); C++) { + const int Off = Cycle + C; + Data.addResourceUse(Id, Off, FUIndex, IsRequired); + } + Cycle += IS.getNextCycles(); + } +} + SolverData PostPipeliner::createSolverData() { SolverData Data; // Add the forward dependence edges within the first iteration for (int N = 0; N < NInstr; N++) { const SUnit &SU = DAG->SUnits[N]; MachineInstr *const MI = SU.getInstr(); - auto SlotKind = TII->getSlotKind(MI->getOpcode()); + const auto SlotKind = TII->getSlotKind(MI->getOpcode()); const uint64_t MemoryBanks = HR.getMemoryBanks(MI); const int Id = @@ -1512,6 +1584,7 @@ SolverData PostPipeliner::createSolverData() { assert(From < NInstr); Data.addLatency(From, N,
Dep.getSignedLatency()); } + addResourceUses(Data, Id, MI, TII); } // Add loop-carried dependences to future iterations. The iteration @@ -1534,10 +1607,8 @@ SolverData PostPipeliner::createSolverData() { bool PostPipeliner::applySolver(const SolverData &Data, SWPSolver &Solver, int NS, bool SEFStage) { - // We don't model the resource hazards. They would be very tedious to express, - // since resource uses are offset relative to the instruction cycle. We would - // need to interpret raw itinerary data, and the modulo constraints on those - // would lead to very awkard expressions. + // FU resource hazards are modeled by SWPSolver::resourceConflicts via + // SolverData::ResourceUses (Required/Reserved bits per InstrItin stage). Solver.setScheduleSize(II, NS); Solver.genModel(Data, SEFStage); if (!Solver.solveModel()) { diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index dd2d2c4762bc..58ff832109a4 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -247,11 +247,14 @@ class PostPipeliner { /// The minimum tripcount, read from the pragma, or from an LC initialization. int MinTripCount = 0; - /// The II requested by a pragma. This will trigger expensive algorithms - /// like solvers or exhaustive searches to be run if the heuristic methods - /// don't find a solution. + /// User/pragma-requested II at which the solver is additionally run. + /// Stays 0 when no solver backend is compiled in. int TargetII = 0; + /// True when TargetII is a hard CLI one-shot (skip heuristics, bypass + /// MaxII), false when it's a soft pragma hint. + bool TargetIIIsHardLimit = false; + /// The Preheader of the loop. MachineBasicBlock *Preheader = nullptr; @@ -316,6 +319,14 @@ class PostPipeliner { /// If it returns true, a valid schedule is laid down in Info. 
bool tryApproaches(); + /// Run the heuristic strategies (each ConfigStrategy plus the relaxed + /// IterCountSlackStrategy fallback) at the current II. + bool runHeuristics(); + + /// Run the solver-based last-resort attempts at the current II: + /// (NS, !SEF), (NS+1, SEF), (NS+1, !SEF). + bool runSolverFallback(); + /// Find the first available unscheduled instruction with the highest /// priority. int mostUrgent(PostPipelinerStrategy &Strategy); @@ -346,6 +357,14 @@ class PostPipeliner { /// \pre isPostPipelineCandidate has returned true int getResMII(MachineBasicBlock &LoopBlock); + /// Return the user/pragma-requested II, or 0 if none was set. + /// \pre isPostPipelineCandidate has returned true + int getTargetII() const { return TargetII; } + + /// True when TargetII is a hard CLI one-shot (vs a soft pragma hint). + /// \pre isPostPipelineCandidate has returned true + bool isTargetIIHardLimit() const { return TargetIIIsHardLimit; } + // Schedule using the given InitiationInterval. Return true when successful. // In that case calls to the query methods below are legitimate. bool schedule(ScheduleDAGMI &DAG, int InitiationInterval, diff --git a/llvm/lib/Target/AIE/AIESWPSolver.cpp b/llvm/lib/Target/AIE/AIESWPSolver.cpp index e222d1d378ad..168ccb2fbe0b 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.cpp +++ b/llvm/lib/Target/AIE/AIESWPSolver.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates // //===----------------------------------------------------------------------===// // This file contains an interface to create constraints to model a software @@ -58,6 +58,14 @@ std::vector> getSolvers() { return Solvers; } +bool hasSolver() { +#if LLVM_WITH_Z3 + return true; +#else + return false; +#endif // LLVM_WITH_Z3 +} + Slot &SolverData::addSlot(int N) { auto It = Slots.emplace(N, Slot(N)).first; return It->second; @@ -72,6 +80,35 @@ int SolverData::addInstruction(int SlotNumber, uint64_t MemoryBanks, return Id; } +// Grow \p BV so bit index \p Idx is in range, then set it. +static void setFU(BitVector &BV, unsigned Idx) { + if (Idx >= BV.size()) + BV.resize(Idx + 1); + BV.set(Idx); +} + +void SolverData::addResourceUse(int InstrId, int CycleOffset, unsigned FUIndex, + bool IsRequired) { + // Try to merge into an existing entry for this (instruction, offset). + for (ResourceUseEntry &E : ResourceUses) { + const bool IsSameSlot = E.Instr == InstrId && E.CycleOffset == CycleOffset; + if (!IsSameSlot) + continue; + + BitVector &TargetBV = IsRequired ? E.Required : E.Reserved; + setFU(TargetBV, FUIndex); + assert(!E.Required.anyCommon(E.Reserved) && + "FU cannot be both Required and Reserved at the same offset"); + return; + } + + // No matching entry: append a fresh one and set the requested bit. + ResourceUses.push_back({InstrId, CycleOffset, BitVector(), BitVector()}); + ResourceUseEntry &New = ResourceUses.back(); + BitVector &TargetBV = IsRequired ? New.Required : New.Reserved; + setFU(TargetBV, FUIndex); +} + void SolverData::addLatency(int Src, int Dst, int Latency, int Distance) { Latencies.emplace_back(Src, Dst, Latency, Distance); if (Distance) { @@ -174,6 +211,40 @@ void SWPSolver::conflicts(const SolverData &Data) { } } +// True iff \p A and \p B touch a shared FU under FuncUnitWrapper::conflict() +// rules: Req-Req, Req-Res, or Res-Req overlap (Res-Res is not a conflict). 
+static bool fuConflict(const ResourceUseEntry &A, const ResourceUseEntry &B) { + return A.Required.anyCommon(B.Required) || A.Reserved.anyCommon(B.Required) || + A.Required.anyCommon(B.Reserved); +} + +void SWPSolver::resourceConflicts(const SolverData &Data) { + const auto &Uses = Data.getResourceUses(); + const auto &Insts = Data.getInstructions(); + for (size_t I = 0; I < Uses.size(); ++I) { + for (size_t J = I + 1; J < Uses.size(); ++J) { + const ResourceUseEntry &UA = Uses[I]; + const ResourceUseEntry &UB = Uses[J]; + // Self-conflicts are tracked by the scoreboard pre-check, not here. + if (UA.Instr == UB.Instr) { + continue; + } + // Only emit when FuncUnitWrapper would also flag this pair (keeps the + // solver model and the real scoreboard in lockstep). + if (!fuConflict(UA, UB)) { + continue; + } + // Same-slot, same-offset is already forbidden by genSlotConstraint; + // skip the redundant resource constraint to keep the model small. + if (UA.CycleOffset == UB.CycleOffset && + Insts[UA.Instr].TheSlot == Insts[UB.Instr].TheSlot) { + continue; + } + genConflict(UA.Instr, UB.Instr, UA.CycleOffset, UB.CycleOffset); + } + } +} + #if LLVM_WITH_Z3 Z3Solver::Z3Solver() : Solver(Context), Zero(Context.int_val(0)) { // timeout behaves undeterministically @@ -230,6 +301,7 @@ void Z3Solver::genModel(const SolverData &Data, bool SEFStage) { cycles(Data); latencies(Data); conflicts(Data); + resourceConflicts(Data); } bool Z3Solver::solveModel() { @@ -391,15 +463,28 @@ void Z3BinarySolver::genSlotConstraint(int SlotNo, const Slot &Slot) { } } -void Z3BinarySolver::genConflict(int M, int N) { +void Z3BinarySolver::genConflict(int InstrA, int InstrB, int OffsetA, + int OffsetB) { const int II = getII(); - z3::expr_vector Elements(Context); - // All stages have a contribution to a particular cycle) - for (int C = 0; C < II; C++) { + // Collision condition for two instructions sharing a resource: + // issueA + OffsetA = issueB + OffsetB (mod II). 
+ // Equivalently: + // issueB = issueA + (OffsetA - OffsetB) (mod II). + // Memory-bank callers pass OffsetA = OffsetB = 0, so this reduces to + // issueA != issueB (mod II). + const int OffsetDiff = OffsetA - OffsetB; + // Normalize into [0, II); +II handles a negative OffsetDiff. + const int Delta = (OffsetDiff % II + II) % II; + + // For each CA the colliding CB is unique. + for (int CA = 0; CA < II; CA++) { + // Unwrap then wrap back into [0, II). + const int CBRaw = CA + Delta; + const int CB = CBRaw % II; z3::expr_vector Elements(Context); for (int S = 0; S < NumStages; S++) { - addVar(M, S, C, Elements); - addVar(N, S, C, Elements); + addVar(InstrA, S, CA, Elements); + addVar(InstrB, S, CB, Elements); } if (Elements.empty()) { continue; @@ -451,8 +536,14 @@ z3::expr Z3IntegerSolver::genCycle(int N) { return StageVarDecls[N] * getII() + CycleVarDecls[N]; } -void Z3IntegerSolver::genConflict(int M, int N) { - Solver.add(CycleVarDecls[M] != CycleVarDecls[N]); +void Z3IntegerSolver::genConflict(int InstrA, int InstrB, int OffsetA, + int OffsetB) { + // Assumes each CycleVarDecl is constrained to [0, II) (see genCycle), so + // cycleB - cycleA is in (-II, II): only two linear collision cases (LIA). + const int II = getII(); + const int Delta = ((OffsetA - OffsetB) % II + II) % II; + const z3::expr Diff = CycleVarDecls[InstrB] - CycleVarDecls[InstrA]; + Solver.add(Diff != Delta && Diff != Delta - II); } void Z3IntegerSolver::genSlotConstraint(int SlotNo, const Slot &Slot) { diff --git a/llvm/lib/Target/AIE/AIESWPSolver.h b/llvm/lib/Target/AIE/AIESWPSolver.h index 2816f233fb2f..c9cb32a7fc7b 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.h +++ b/llvm/lib/Target/AIE/AIESWPSolver.h @@ -4,13 +4,14 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc.
or its affiliates // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AIE_AIESWPSOLVER_H #define LLVM_LIB_TARGET_AIE_AIESWPSOLVER_H +#include "llvm/ADT/BitVector.h" #include "llvm/Config/config.h" #if LLVM_WITH_Z3 #include "z3++.h" @@ -69,6 +70,15 @@ class Instruction { HasSideEffect(HasSideEffect) {} }; +// Per-cycle FU fingerprint of an instruction. BitVector (not uint64_t) +// so the model scales past 64 FUs (AIE2PS has 106). See fuConflict(). +struct ResourceUseEntry { + int Instr; + int CycleOffset; // relative to the instruction's issue cycle + BitVector Required; + BitVector Reserved; +}; + class ProblemSize { public: ProblemSize(); @@ -99,6 +109,9 @@ class SolverData { std::vector Instructions; // Holds all latencies. std::vector Latencies; + // One entry per (Instr, CycleOffset) with Required/Reserved BitVectors + // OR-aggregated across stages covering CycleOffset. + std::vector ResourceUses; // Add a slot to the problem Slot &addSlot(int N); @@ -109,6 +122,10 @@ class SolverData { // Distance represents the iteration distance, i.e. the number of // cfg backedges it spans. void addLatency(int Src, int Dst, int Latency, int Distance = 0); + /// Mark FU bit \p FUIndex as Required (\p IsRequired) or Reserved on + /// the (\p InstrId, \p CycleOffset) entry, creating it if absent. + void addResourceUse(int InstrId, int CycleOffset, unsigned FUIndex, + bool IsRequired); // Post-process when all data has been added. // II supplies the initiation interval. 
@@ -130,6 +147,9 @@ class SolverData { const std::vector &getInstructions() const { return Instructions; } + const std::vector &getResourceUses() const { + return ResourceUses; + } }; class SWPSolver { @@ -156,9 +176,11 @@ class SWPSolver { virtual void genSlotConstraint(int SlotNo, const Slot &Slot) = 0; // Generate a constraint that represents a dependence latency virtual void genLatencyConstraint(const Latency &L) = 0; - // Generate a mutual exclusion constraint for instructions M and N in any - // cycle - virtual void genConflict(int M, int N) = 0; + // Forbid (\p InstrA, \p InstrB) from colliding at the same modular cycle: + // issueA + \p OffsetA = issueB + \p OffsetB (mod II). Default offsets 0 + // recover the legacy "issueA != issueB (mod II)" memory-bank constraint. + virtual void genConflict(int InstrA, int InstrB, int OffsetA = 0, + int OffsetB = 0) = 0; // Return the vector of instruction cycles // \pre genModel() has returned true @@ -178,11 +200,20 @@ class SWPSolver { virtual bool solveModel() = 0; // Generate further instruction conflict constraints void conflicts(const SolverData &Data); + // Emit FU exclusion constraints from \p Data.ResourceUses: every pair of + // instructions sharing an FU bit is forbidden from co-occupying its modular + // cycle. + void resourceConflicts(const SolverData &Data); }; // Return the set of solvers to try std::vector> getSolvers(); +/// Return true if at least one SWP solver backend is compiled into this +/// build. When false, getSolvers() returns an empty vector and any code +/// path that depends on the solver must fall back gracefully. 
+bool hasSolver(); + #if LLVM_WITH_Z3 class Z3Solver : public SWPSolver { protected: @@ -249,7 +280,8 @@ class Z3BinarySolver : public Z3Solver { void genSlotConstraint(int SlotNo, const Slot &Slot) override; void genLatencyConstraint(const Latency &L) override; - void genConflict(int M, int N) override; + void genConflict(int InstrA, int InstrB, int OffsetA = 0, + int OffsetB = 0) override; z3::expr genCycle(int I) override; void vars(const SolverData &Data, bool SEFStage) override; void scheduled(const SolverData &Data) override; @@ -269,7 +301,7 @@ class Z3IntegerSolver : public Z3Solver { void vars(const SolverData &Data, bool SEFStage) override; void scheduled(const SolverData &Data) override; z3::expr genCycle(int N) override; - void genConflict(int M, int N) override; + void genConflict(int A, int B, int OffsetA = 0, int OffsetB = 0) override; void genSlotConstraint(int SlotNo, const Slot &Slot) override; public: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir index 5376373100a2..6d59d70e4dfe 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir @@ -3,13 +3,14 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates # REQUIRES: enable_z3_solver # RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \ # RUN: --start-before=postmisched \ # RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-solver \ # RUN: -o - | FileCheck %s @@ -22,65 +23,64 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm + ; CHECK-NEXT: vldb x4, [p7, #64]; nopx ; CHECK-NEXT: vldb.3d x7, [p7], d0; movs p4, p7 ; CHECK-NEXT: paddb [p4], m4 - ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: vldb x5, [p4, #64] + ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0 + ; CHECK-NEXT: vldb x9, [p4, #0]; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; add.nc lc, r0, #-3; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: 
vconv.bfp16ebs8.fp32 ex3, dm4; vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; nopv + ; CHECK-NEXT: padda [p5], m5; nopb ; nopx ; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] + ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle 
x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: mov p5, p6 - ; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; nopx ; vshuffle x9, x9, x5, r1; vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] + ; CHECK-NEXT: mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 
cml4, [p5, #0] ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: nop ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir index 719ace1080ad..fb04cfdebb19 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir @@ -3,13 +3,14 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates # REQUIRES: enable_z3_solver # RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \ # RUN: --start-before=postmisched \ # RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-solver \ # RUN: -o - | FileCheck %s # derived from GEMM_Bfp16_opt_0 @@ -29,36 +30,36 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm ; movs p4, p7 + ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopx ; movs p4, p7 ; CHECK-NEXT: vldb.3d x7, [p7], d0 ; CHECK-NEXT: paddb [p4], m4 ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: vldb x5, [p4, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop - ; CHECK-NEXT: movs p4, p7; vldb x4, [p7, #64]; add.nc lc, r0, #-3 + ; CHECK-NEXT: vldb x4, [p7, #64]; movs p4, p7 ; CHECK-NEXT: vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: paddb [p4], m4; vshuffle x7, x7, x4, r1 ; CHECK-NEXT: vldb x9, [p4, #0]; mov p5, p6 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vshuffle x9, x9, x5, r1 - ; CHECK-NEXT: padda [p5], m5; movxm ls, #.LBB0_1 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vldb x4, [p7, #64]; movs p4, p7; movxm le, #.L_LEnd0; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: padda [p5], m5; add.nc lc, r0, #-3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vldb x4, [p7, #64]; movs p4, p7; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: paddb [p4], m4; vshuffle x7, x7, x4, r1 ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vldb x9, [p4, #0]; mov p5, p6 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x9, x9, x5, r1 - ; CHECK-NEXT: padda [p5], m5; 
vconv.bfp16ebs8.fp32 ex3, dm4 + ; CHECK-NEXT: padda [p5], m5; nopb ; vconv.bfp16ebs8.fp32 ex3, dm4; movxm le, #.L_LEnd0; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vldb x4, [p7, #64]; movs p4, p7; nopxm ; vmul.f dm4, y4, y5, r2 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vldb.3d x7, [p7], d0; vconv.bfp16ebs8.fp32 ex2, dm4; nopx ; vshuffle x6, x7, x4, r0; nopv - ; CHECK-NEXT: paddb [p4], m4; vshuffle x7, x7, x4, r1; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vldb x9, [p4, #0]; mov p5, p6; vmac.f dm2, dm2, ex0, ex3, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x9, x9, x5, r1; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: nopa ; paddb [p4], m4; nops ; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; mov p5, p6; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; nops ; nopx ; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; nopb ; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: padda [p5], m5; nopb ; vconv.bfp16ebs8.fp32 ex3, dm4; nopxm ; vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir index 49e5498ad6d0..e88a9cb2069f 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir @@ 
-3,13 +3,14 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates # REQUIRES: enable_z3_solver # RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \ # RUN: --start-before=postmisched \ # RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-solver \ # RUN: -o - | FileCheck %s @@ -22,65 +23,64 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm + ; CHECK-NEXT: vldb x4, [p7, #64]; nopx ; CHECK-NEXT: vldb.3d x7, [p7], d0; movs p4, p7 ; CHECK-NEXT: paddb [p4], m4 - ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: vldb x5, [p4, #64] + ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vldb x4, [p7, #64]; vshuffle x6, x7, x4, r0 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x7, x7, x4, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; mov p5, p6 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0 + ; CHECK-NEXT: vldb x9, [p4, #0]; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; add.nc lc, r0, #-3; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d 
x7, [p7], d0; vshuffle x6, x7, x4, r0 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb x4, [p7, #64]; vshuffle x6, x7, x4, r0 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x7, x7, x4, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; mov p5, p6 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; nopv + ; CHECK-NEXT: padda [p5], m5; nopb ; nopx ; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; 
vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] + ; CHECK-NEXT: vldb x4, [p7, #64]; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x7, x7, x4, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; mov p5, p6; vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: mov p5, p6 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; nopx ; vshuffle x9, x9, x5, r1; vconv.bfp16ebs8.fp32 ex3, dm4 + ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] ; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vshuffle x7, x7, x4, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; 
vconv.bfp16ebs8.fp32 ex0, dm4; mov p5, p6; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: nop ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-10instr-solver.mir b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-10instr-solver.mir new file mode 100644 index 000000000000..c43631c4509e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-10instr-solver.mir @@ -0,0 +1,122 @@ 
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: enable_z3_solver + +# RUN: llc -mtriple=aie2ps -O2 --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-solver \ +# RUN: -o - | FileCheck %s + +# MaxPool2D inner loop (10 instructions) with VSEL. +# Similar to the 9-instr variant but with an extra VSEL_8 between +# VSHIFT and VMAX_LT_8 in the compute chain. +# The Z3 solver finds a solution at II=4 NS=4 but it fails +# CheckFixedSchedule due to pipeline resource hazards. +# Without the retry mechanism the heuristic falls back to II=7. + +--- | + target triple = "aie2ps" + + define void @maxpool_inner_10instr(ptr addrspace(5) noalias %data, ptr addrspace(6) noalias %weights, ptr addrspace(7) noalias %out, i32 %n) { + ; CHECK-LABEL: maxpool_inner_10instr: + ; CHECK: // %bb.0: // %preheader + ; CHECK-NEXT: nopa ; nopb ; nopx ; mov p0, p6; nops + ; CHECK-NEXT: movs p3, p1; vldb wh9, [p0, #32]; mov p5, p6 + ; CHECK-NEXT: vlda wl7, [p3], #32; vldb.3d wl9, [p5], d0; mov r24, p0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: movs p0, p5 + ; CHECK-NEXT: vldb wh9, [p0, #32]; and r26, r24, r18 + ; CHECK-NEXT: vlda wl7, [p3], #32; vldb.3d wl9, [p5], d0; add.nc lc, r2, #-3; mov r24, p0 + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: nopa ; nopb ; movs p0, p5; movxm le, #.L_LEnd0; nopv + ; CHECK-NEXT: nopa ; vldb wh9, [p0, #32]; nops ; and r26, r24, r18; vshift x5, x9, x0, r26; nopv + ; CHECK-NEXT: vlda wl7, [p3], #32; vldb.3d wl9, [p5], d0; nops ; nopx ; mov r24, p0; nopv + ; CHECK-NEXT: nopa ; nopb ; movs p4, p2; nopx ; vsel.8 x3, x0, x5, r5:r4; nopv + ; CHECK-NEXT: nopa ; nopb ; movs p0, p5; nopx ; 
vmax_lt.8 x1, r17:r16, x7, x3, vaddsign1; nopv + ; CHECK-NEXT: .LBB0_1: // %loop_body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; vldb wh9, [p0, #32]; and r26, r24, r18; vshift x5, x9, x0, r26; nops + ; CHECK-NEXT: vlda wl7, [p3], #32; vldb.3d wl9, [p5], d0; mov r24, p0 + ; CHECK-NEXT: vst wl1, [p4], #32; vsel.8 x3, x0, x5, r5:r4 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; nopb ; movs p0, p5; nopx ; vmax_lt.8 x1, r17:r16, x7, x3, vaddsign1; nopv + ; CHECK-NEXT: // %bb.2: // %exit + ; CHECK-NEXT: nopa ; and r26, r24, r18; vshift x5, x9, x0, r26 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst wl1, [p4], #32; vsel.8 x3, x0, x5, r5:r4 + ; CHECK-NEXT: vmax_lt.8 x1, r17:r16, x7, x3, vaddsign1 + ; CHECK-NEXT: vshift x5, x9, x0, r26 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst wl1, [p4], #32; vsel.8 x3, x0, x5, r5:r4 + ; CHECK-NEXT: vmax_lt.8 x1, r17:r16, x7, x3, vaddsign1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst wl1, [p4], #32 + ; CHECK-NEXT: ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + preheader: + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %loop_body + loop_body: + %data.phi = phi ptr addrspace(5) [ %data, %preheader ], [ %data.phi, %loop_body ] + %weights.phi = phi ptr addrspace(6) [ %weights, %preheader ], [ %weights.phi, %loop_body ] + %out.phi = phi ptr addrspace(7) [ %out, %preheader ], [ %out.phi, %loop_body ] + %dec = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %dec, label %loop_body, label %exit, !llvm.loop !0 + exit: + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1} + !1 = !{!"llvm.loop.itercount.range", i64 4} +... 
+--- +name: maxpool_inner_10instr +tracksRegLiveness: true +body: | + bb.0.preheader: + successors: %bb.1(0x80000000) + liveins: $p1, $p2, $p6, $r2, $r6, $r18, $r22, $x0:0x0000000000000003, $l2:0x000000000C000000, $d0_3d:0x3800000040001C00, $crbf8conf, $crfp8conf, $vaddsign1 + + $lc = ADD_NC_add_lc_ri $r2, 0 + MOVXM_lng_cg_ls_abs %bb.1, implicit-def $ls + MOVXM_lng_cg_le_abs , implicit-def $le + $p3 = MOV_scalar_pseudo $p1 + $p4 = MOV_scalar_pseudo $p2 + $p0 = MOV_scalar_pseudo $p6 + $p5 = MOV_scalar_pseudo $p6 + + bb.1.loop_body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $l2:0x000000000C000000, $m0, $m1, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $r0, $r2, $r6, $r18, $r22, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00 + + renamable $wh9 = VLD_w_idx_imm_pseudo renamable $p0, 32 :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + $wl9, $p5, $dc0, $dc4 = VLD_3D_w_pseudo killed $p5, $d0_3d :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + renamable $wl7, renamable $p3 = VLD_w_pstm_nrm_imm_pseudo killed renamable $p3, 32 :: (load (<8 x s32>) from %ir.weights.phi, addrspace 6) + $r24 = MOV_alu_mv_mv_mv_scl killed $p0 + renamable $r26 = AND killed renamable $r24, renamable $r18 + renamable $x5 = VSHIFT killed renamable $x9, undef renamable $x0, killed renamable $r26 + renamable $x3 = VSEL_8 renamable $x0, killed renamable $x5, renamable $l2, implicit $crbf8conf, implicit $crfp8conf + renamable $x1, dead renamable $l8 = VMAX_LT_8_vaddSign1 renamable $x7, killed renamable $x3, implicit $crbf8conf, implicit $crfp8conf, implicit $vaddsign1 + renamable $p4 = VST_dmw_sts_w_st_pstm_nrm_imm killed renamable $wl1, killed renamable $p4, 32 :: (store (<8 x s32>) into %ir.out.phi, addrspace 6) + $p0 = MOV_scalar_pseudo $p5 + PseudoLoopEnd , %bb.1 + + bb.2.exit: + liveins: $p3, $p4 + RET implicit $lr + DelayedSchedBarrier +... 
diff --git a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-9instr-solver.mir b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-9instr-solver.mir new file mode 100644 index 000000000000..acc810838d23 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-9instr-solver.mir @@ -0,0 +1,117 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: enable_z3_solver + +# RUN: llc -mtriple=aie2ps -O2 --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-solver \ +# RUN: -o - | FileCheck %s + +# MaxPool2D inner loop (9 instructions) without VSEL. +# The Z3 solver uses the same NS as the heuristics (MinLength/II and MinLength/II + 1) +# and finds a valid schedule at II=4. 
+ +--- | + target triple = "aie2ps" + + define void @maxpool_inner_9instr(ptr addrspace(5) noalias %data, ptr addrspace(6) noalias %weights, ptr addrspace(7) noalias %out, i32 %n) { + ; CHECK-LABEL: maxpool_inner_9instr: + ; CHECK: // %bb.0: // %preheader + ; CHECK-NEXT: nopa ; nopb ; nopx ; mov p3, p0; nops + ; CHECK-NEXT: movs p5, p1; vldb wh5, [p3, #32]; mov p7, p0 + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; mov r22, p3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: movs p3, p7 + ; CHECK-NEXT: vldb wh5, [p3, #32]; and r24, r22, r20 + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; add.nc lc, r2, #-3; mov r22, p3 + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: nopa ; nopb ; movs p3, p7; movxm le, #.L_LEnd0; nopv + ; CHECK-NEXT: nopa ; vldb wh5, [p3, #32]; nops ; and r24, r22, r20; vshift x1, x5, x0, r24; nopv + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; nops ; nopx ; mov r22, p3; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1; nopv + ; CHECK-NEXT: nopa ; nopb ; movs p3, p7; nopx ; mov p6, p2; nopv + ; CHECK-NEXT: .LBB0_1: // %loop_body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; vldb wh5, [p3, #32]; nops ; and r24, r22, r20; vshift x1, x5, x0, r24; nopv + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; vst wl10, [p6], #32; mov r22, p3 + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; nopb ; movs p3, p7; nopxm ; nopv + ; CHECK-NEXT: // %bb.2: // %exit + ; CHECK-NEXT: nopa ; nopb ; nops ; and r24, r22, r20; vshift x1, x5, x0, r24; nopv + ; CHECK-NEXT: vst wl10, [p6], #32; nopb ; nopx + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshift x1, x5, x0, r24 + ; CHECK-NEXT: vst wl10, [p6], #32 + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst wl10, [p6], #32 + ; CHECK-NEXT: ret lr + ; 
CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + preheader: + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %loop_body + loop_body: + %data.phi = phi ptr addrspace(5) [ %data, %preheader ], [ %data.phi, %loop_body ] + %weights.phi = phi ptr addrspace(6) [ %weights, %preheader ], [ %weights.phi, %loop_body ] + %out.phi = phi ptr addrspace(7) [ %out, %preheader ], [ %out.phi, %loop_body ] + %dec = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %dec, label %loop_body, label %exit, !llvm.loop !0 + exit: + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1} + !1 = !{!"llvm.loop.itercount.range", i64 4} +... +--- +name: maxpool_inner_9instr +tracksRegLiveness: true +body: | + bb.0.preheader: + successors: %bb.1(0x80000000) + liveins: $p0, $p1, $p2, $p5, $p6, $p7, $r2, $r20, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00, $crbf8conf, $crfp8conf, $vaddsign1 + + $lc = ADD_NC_add_lc_ri $r2, 0 + MOVXM_lng_cg_ls_abs %bb.1, implicit-def $ls + MOVXM_lng_cg_le_abs , implicit-def $le + $p5 = MOV_scalar_pseudo $p1 + $p6 = MOV_scalar_pseudo $p2 + $p3 = MOV_scalar_pseudo $p0 + $p7 = MOV_scalar_pseudo $p0 + + bb.1.loop_body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $dc0, $dc1, $dc4, $dj0, $dj1, $dj4, $dn0, $dn1, $dn4, $l2:0x0000000008000000, $m0, $m1, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r4, $r6, $r20, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00 + + renamable $wh5 = VLD_w_idx_imm_pseudo renamable $p3, 32 :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + $wl5, $p7, $dc0, $dc4 = VLD_3D_w_pseudo killed $p7, $d0_3d :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + renamable $wl3, renamable $p5 = VLD_w_pstm_nrm_imm_pseudo killed renamable $p5, 32 :: (load (<8 x s32>) from %ir.weights.phi, addrspace 6) + $r22 
= MOV_alu_mv_mv_mv_scl killed $p3 + renamable $r24 = AND killed renamable $r22, renamable $r20 + renamable $x1 = VSHIFT killed renamable $x5, undef renamable $x0, killed renamable $r24 + renamable $x10, dead renamable $l8 = VMAX_LT_8_vaddSign1 renamable $x3, killed renamable $x1, implicit $crbf8conf, implicit $crfp8conf, implicit $vaddsign1 + renamable $p6 = VST_dmw_sts_w_st_pstm_nrm_imm killed renamable $wl10, killed renamable $p6, 32 :: (store (<8 x s32>) into %ir.out.phi, addrspace 6) + $p3 = MOV_scalar_pseudo $p7 + PseudoLoopEnd , %bb.1 + + bb.2.exit: + liveins: $p5, $p6 + RET implicit $lr + DelayedSchedBarrier +... diff --git a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-9instr-target-ii.mir b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-9instr-target-ii.mir new file mode 100644 index 000000000000..9f5065023699 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-9instr-target-ii.mir @@ -0,0 +1,111 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: enable_z3_solver + +# RUN: llc -mtriple=aie2ps -O2 --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-target-ii=4 \ +# RUN: -o - | FileCheck %s + +# MaxPool2D inner loop (9 instructions) without VSEL. +# Locks in the --aie-postpipeliner-target-ii=4 path: the post-pipeliner +# starts directly at II=4, skips heuristics, and runs only the Z3 solver +# at that II (no fallback to other IIs). 
+ +--- | + target triple = "aie2ps" + + define void @maxpool_inner_9instr(ptr addrspace(5) noalias %data, ptr addrspace(6) noalias %weights, ptr addrspace(7) noalias %out, i32 %n) { + ; CHECK-LABEL: maxpool_inner_9instr: + ; CHECK: // %bb.0: // %preheader + ; CHECK-NEXT: nopa ; nopb ; movs p5, p1; nopx ; mov p7, p0; nopv + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; nops ; nopx ; mov p3, p0; nopv + ; CHECK-NEXT: vldb wh5, [p3, #32] + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: add.nc lc, r2, #-2; mov r22, p3 + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; movs p3, p7; movxm le, #.L_LEnd0; nopv + ; CHECK-NEXT: nopa ; vldb wh5, [p3, #32]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; and r24, r22, r20; nopm ; nopv + ; CHECK-NEXT: nopa ; nopb ; movs p6, p2; nopx ; mov r22, p3; nopv + ; CHECK-NEXT: .LBB0_1: // %loop_body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; movs p3, p7; nopx ; vshift x1, x5, x0, r24; nopv + ; CHECK-NEXT: nopa ; vldb wh5, [p3, #32]; nops ; nopx ; vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; and r24, r22, r20; nopm ; nopv + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; nopb ; vst wl10, [p6], #32; nopx ; mov r22, p3; nopv + ; CHECK-NEXT: // %bb.2: // %exit + ; CHECK-NEXT: movs p3, p7; vshift x1, x5, x0, r24 + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: and r24, r22, r20 + ; CHECK-NEXT: vst wl10, [p6], #32 + ; CHECK-NEXT: vshift x1, x5, x0, r24 + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst wl10, [p6], #32 + ; CHECK-NEXT: ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + preheader: + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %loop_body + loop_body: + %data.phi = phi ptr 
addrspace(5) [ %data, %preheader ], [ %data.phi, %loop_body ] + %weights.phi = phi ptr addrspace(6) [ %weights, %preheader ], [ %weights.phi, %loop_body ] + %out.phi = phi ptr addrspace(7) [ %out, %preheader ], [ %out.phi, %loop_body ] + %dec = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %dec, label %loop_body, label %exit, !llvm.loop !0 + exit: + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1} + !1 = !{!"llvm.loop.itercount.range", i64 4} +... +--- +name: maxpool_inner_9instr +tracksRegLiveness: true +body: | + bb.0.preheader: + successors: %bb.1(0x80000000) + liveins: $p0, $p1, $p2, $p5, $p6, $p7, $r2, $r20, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00, $crbf8conf, $crfp8conf, $vaddsign1 + + $lc = ADD_NC_add_lc_ri $r2, 0 + MOVXM_lng_cg_ls_abs %bb.1, implicit-def $ls + MOVXM_lng_cg_le_abs , implicit-def $le + $p5 = MOV_scalar_pseudo $p1 + $p6 = MOV_scalar_pseudo $p2 + $p3 = MOV_scalar_pseudo $p0 + $p7 = MOV_scalar_pseudo $p0 + + bb.1.loop_body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $dc0, $dc1, $dc4, $dj0, $dj1, $dj4, $dn0, $dn1, $dn4, $l2:0x0000000008000000, $m0, $m1, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r4, $r6, $r20, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00 + + renamable $wh5 = VLD_w_idx_imm_pseudo renamable $p3, 32 :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + $wl5, $p7, $dc0, $dc4 = VLD_3D_w_pseudo killed $p7, $d0_3d :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + renamable $wl3, renamable $p5 = VLD_w_pstm_nrm_imm_pseudo killed renamable $p5, 32 :: (load (<8 x s32>) from %ir.weights.phi, addrspace 6) + $r22 = MOV_alu_mv_mv_mv_scl killed $p3 + renamable $r24 = AND killed renamable $r22, renamable $r20 + renamable $x1 = VSHIFT killed renamable $x5, undef renamable $x0, killed renamable $r24 + renamable $x10, dead renamable $l8 = VMAX_LT_8_vaddSign1 renamable $x3, killed renamable $x1, implicit 
$crbf8conf, implicit $crfp8conf, implicit $vaddsign1 + renamable $p6 = VST_dmw_sts_w_st_pstm_nrm_imm killed renamable $wl10, killed renamable $p6, 32 :: (store (<8 x s32>) into %ir.out.phi, addrspace 6) + $p3 = MOV_scalar_pseudo $p7 + PseudoLoopEnd , %bb.1 + + bb.2.exit: + liveins: $p5, $p6 + RET implicit $lr + DelayedSchedBarrier +... diff --git a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-target-ii-bypass-maxii.mir b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-target-ii-bypass-maxii.mir new file mode 100644 index 000000000000..02b7406e2935 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/maxpool-target-ii-bypass-maxii.mir @@ -0,0 +1,112 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: enable_z3_solver + +# RUN: llc -mtriple=aie2ps -O2 --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-maxii=2 \ +# RUN: --aie-postpipeliner-target-ii=4 \ +# RUN: -o - | FileCheck %s + +# MaxPool2D inner loop (9 instructions) without VSEL. +# Locks in that --aie-postpipeliner-target-ii=4 bypasses +# --aie-postpipeliner-maxii=2: without the bypass the post-pipeliner would +# never reach II=4, but TargetII forces a one-shot attempt at exactly that II. 
+ +--- | + target triple = "aie2ps" + + define void @maxpool_inner_9instr(ptr addrspace(5) noalias %data, ptr addrspace(6) noalias %weights, ptr addrspace(7) noalias %out, i32 %n) { + ; CHECK-LABEL: maxpool_inner_9instr: + ; CHECK: // %bb.0: // %preheader + ; CHECK-NEXT: nopa ; nopb ; movs p5, p1; nopx ; mov p7, p0; nopv + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; nops ; nopx ; mov p3, p0; nopv + ; CHECK-NEXT: vldb wh5, [p3, #32] + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: add.nc lc, r2, #-2; mov r22, p3 + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; movs p3, p7; movxm le, #.L_LEnd0; nopv + ; CHECK-NEXT: nopa ; vldb wh5, [p3, #32]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; and r24, r22, r20; nopm ; nopv + ; CHECK-NEXT: nopa ; nopb ; movs p6, p2; nopx ; mov r22, p3; nopv + ; CHECK-NEXT: .LBB0_1: // %loop_body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vlda wl3, [p5], #32; vldb.3d wl5, [p7], d0; movs p3, p7; nopx ; vshift x1, x5, x0, r24; nopv + ; CHECK-NEXT: nopa ; vldb wh5, [p3, #32]; nops ; nopx ; vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; and r24, r22, r20; nopm ; nopv + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; nopb ; vst wl10, [p6], #32; nopx ; mov r22, p3; nopv + ; CHECK-NEXT: // %bb.2: // %exit + ; CHECK-NEXT: movs p3, p7; vshift x1, x5, x0, r24 + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: and r24, r22, r20 + ; CHECK-NEXT: vst wl10, [p6], #32 + ; CHECK-NEXT: vshift x1, x5, x0, r24 + ; CHECK-NEXT: vmax_lt.8 x10, r17:r16, x3, x1, vaddsign1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst wl10, [p6], #32 + ; CHECK-NEXT: ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + preheader: + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %loop_body + loop_body: + %data.phi = phi ptr 
addrspace(5) [ %data, %preheader ], [ %data.phi, %loop_body ] + %weights.phi = phi ptr addrspace(6) [ %weights, %preheader ], [ %weights.phi, %loop_body ] + %out.phi = phi ptr addrspace(7) [ %out, %preheader ], [ %out.phi, %loop_body ] + %dec = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %dec, label %loop_body, label %exit, !llvm.loop !0 + exit: + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1} + !1 = !{!"llvm.loop.itercount.range", i64 4} +... +--- +name: maxpool_inner_9instr +tracksRegLiveness: true +body: | + bb.0.preheader: + successors: %bb.1(0x80000000) + liveins: $p0, $p1, $p2, $p5, $p6, $p7, $r2, $r20, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00, $crbf8conf, $crfp8conf, $vaddsign1 + + $lc = ADD_NC_add_lc_ri $r2, 0 + MOVXM_lng_cg_ls_abs %bb.1, implicit-def $ls + MOVXM_lng_cg_le_abs , implicit-def $le + $p5 = MOV_scalar_pseudo $p1 + $p6 = MOV_scalar_pseudo $p2 + $p3 = MOV_scalar_pseudo $p0 + $p7 = MOV_scalar_pseudo $p0 + + bb.1.loop_body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $dc0, $dc1, $dc4, $dj0, $dj1, $dj4, $dn0, $dn1, $dn4, $l2:0x0000000008000000, $m0, $m1, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r4, $r6, $r20, $x0:0x0000000000000003, $d0_3d:0x3800000040001C00 + + renamable $wh5 = VLD_w_idx_imm_pseudo renamable $p3, 32 :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + $wl5, $p7, $dc0, $dc4 = VLD_3D_w_pseudo killed $p7, $d0_3d :: (load (<8 x s32>) from %ir.data.phi, addrspace 5) + renamable $wl3, renamable $p5 = VLD_w_pstm_nrm_imm_pseudo killed renamable $p5, 32 :: (load (<8 x s32>) from %ir.weights.phi, addrspace 6) + $r22 = MOV_alu_mv_mv_mv_scl killed $p3 + renamable $r24 = AND killed renamable $r22, renamable $r20 + renamable $x1 = VSHIFT killed renamable $x5, undef renamable $x0, killed renamable $r24 + renamable $x10, dead renamable $l8 = VMAX_LT_8_vaddSign1 renamable $x3, killed renamable $x1, implicit 
$crbf8conf, implicit $crfp8conf, implicit $vaddsign1 + renamable $p6 = VST_dmw_sts_w_st_pstm_nrm_imm killed renamable $wl10, killed renamable $p6, 32 :: (store (<8 x s32>) into %ir.out.phi, addrspace 6) + $p3 = MOV_scalar_pseudo $p7 + PseudoLoopEnd , %bb.1 + + bb.2.exit: + liveins: $p5, $p6 + RET implicit $lr + DelayedSchedBarrier +... diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 156dbd31e3db..1576aab1678f 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -1,5 +1,7 @@ # -*- Python -*- +# Modifications (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + # Configuration file for the 'lit' test runner. import os @@ -663,6 +665,9 @@ def host_unwind_supports_jit(): if config.have_httplib: config.available_features.add("httplib") +if config.have_z3_solver: + config.available_features.add("enable_z3_solver") + if config.have_opt_viewer_modules: config.available_features.add("have_opt_viewer_modules") diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 0d02920323d2..b51bbabdd867 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -1,5 +1,7 @@ @LIT_SITE_CFG_IN_HEADER@ +# Modifications (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + import sys config.host_triple = "@LLVM_HOST_TRIPLE@" @@ -65,6 +67,7 @@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ config.have_vc_rev = @LLVM_APPEND_VC_REV@ config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@" config.has_logf128 = @LLVM_HAS_LOGF128@ +config.have_z3_solver = @LLVM_WITH_Z3@ import lit.llvm lit.llvm.initialize(lit_config, config)