diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 4707150ab209..b43c72c21564 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -137,6 +137,11 @@ namespace llvm { /// Whether lane masks should get tracked. bool TrackLaneMasks = false; + /// This controls registering single defs in CurrentVRegDefs. + /// For special uses of ScheduleDAGInstrs, we can not use the assumption + /// that defs dominate all uses. + bool AbandonSingleDefs = true; + // State specific to the current scheduling region. // ------------------------------------------------ @@ -351,7 +356,8 @@ namespace llvm { /// traversal of the SUnits vector. void buildEdges(AAResults *AA, RegPressureTracker *RPTracker = nullptr, PressureDiffs *PDiffs = nullptr, - LiveIntervals *LIS = nullptr, bool TrackLaneMasks = false); + LiveIntervals *LIS = nullptr, bool TrackLaneMasks = false, + bool AbandonSingleDefs = true); /// Adds dependencies from instructions in the current list of /// instructions being scheduled to scheduling barrier. We want to make sure diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 4ec3e91ae044..168145f5c80f 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -492,7 +492,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { } // Shortcut: Singly defined vregs do not have output/anti dependencies. - if (MRI.hasOneDef(Reg)) + if (AbandonSingleDefs && MRI.hasOneDef(Reg)) return; // Add output dependence to the next nearest defs of this vreg. 
@@ -868,7 +868,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA, void ScheduleDAGInstrs::buildEdges(AAResults *AA, RegPressureTracker *RPTracker, PressureDiffs *PDiffs, LiveIntervals *LIS, - bool TrackLaneMasks) { + bool TrackLaneMasks, + bool AbandonSingleDefs) { const TargetSubtargetInfo &ST = MF.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI @@ -877,6 +878,7 @@ void ScheduleDAGInstrs::buildEdges(AAResults *AA, RegPressureTracker *RPTracker, AAForDep.emplace(*AA); BarrierChain = nullptr; this->TrackLaneMasks = TrackLaneMasks; + this->AbandonSingleDefs = AbandonSingleDefs; if (PDiffs) PDiffs->init(SUnits.size()); diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 510546674f3d..e7c07a13a410 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -19,6 +19,7 @@ #include "AIEMachineScheduler.h" #include "AIEMaxLatencyFinder.h" #include "AIEMultiSlotInstrMaterializer.h" +#include "AIERegDefUseTracker.h" #include "Utils/AIELoopUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -37,6 +38,7 @@ // --debug-only=sched-blocks,machine-scheduler #define DEBUG_LOOPAWARE(X) DEBUG_WITH_TYPE("loop-aware", X) #define DEBUG_BLOCKS(X) DEBUG_WITH_TYPE("sched-blocks", X) +#define DEBUG_REGALLOC(X) DEBUG_WITH_TYPE("aie-reg-liverange", X) using namespace llvm; @@ -76,8 +78,52 @@ static cl::opt PostPipelinerMaxTryII( "aie-postpipeliner-maxtry-ii", cl::init(20), cl::desc("[AIE] Maximum II steps to be tried in the post-ra pipeliner")); +static cl::opt TestRegDefUseTracker( + "aie-test-regdefuse-tracker", cl::Hidden, cl::init(false), + cl::desc("[AIE] TEST MODE: Run RegDefUseTracker analysis on all loops " + "(for testing only)")); + namespace llvm::AIE { +// Helper function to get the name of a PostPipelinerMode as a string +const char 
*getPostPipelinerModeName(PostPipelinerMode Mode) { + switch (Mode) { + case PostPipelinerMode::None: + return "None"; + case PostPipelinerMode::Physical: + return "Physical"; + case PostPipelinerMode::Virtual: + return "Virtual"; + case PostPipelinerMode::ReservedVirtual: + return "ReservedVirtual"; + } + return "Unknown"; +} + +// Option for enabling virtual register mode in the postpipeliner +static cl::opt PostPipelinerVRegMode( + "aie-postpipeliner-vreg-mode", cl::Hidden, cl::init(true), + cl::desc("[AIE] Enable virtual register mode for the postpipeliner " + "(replaces filtered physical registers with virtual registers)")); + +// Option for enabling physical register mode in the postpipeliner +static cl::opt PostPipelinerPhysMode( + "aie-postpipeliner-phys-mode", cl::Hidden, cl::init(true), + cl::desc("[AIE] Enable physical register mode for the postpipeliner " + "(use physical registers without virtualization)")); + +// Option for enabling reserved virtual register mode in the postpipeliner +static cl::opt PostPipelinerVRegReservedMode( + "aie-postpipeliner-vreg-reserved-mode", cl::Hidden, cl::init(false), + cl::desc("[AIE] Enable reserved virtual register mode for the " + "postpipeliner (virtualizes ranges overlapping RESERVED bases)")); + +// Option for filtering live ranges with no register choice +static cl::opt FilterNoChoiceRegs( + "aie-postpipeliner-filter-no-choice", cl::Hidden, cl::init(false), + cl::desc("[AIE] Filter out live ranges with only one available physical " + "register to prevent pipeliner invalidation")); + void dumpInterBlock(const InterBlockEdges &Edges) { for (const SUnit &SU : Edges) { dbgs() << "SU" << SU.NodeNum << ": " << *SU.getInstr(); @@ -235,7 +281,7 @@ void InterBlockScheduling::markEpilogueBlocks() { } void InterBlockScheduling::enterFunction(MachineFunction *MF) { - DEBUG_BLOCKS(dbgs() << ">> enterFunction " << MF->getName() << "\n"); + DEBUG_BLOCKS(dbgs() << "PSBEGIN Function " << MF->getName() << "\n"); // Get ourselves 
a hazard recognizer const auto &Subtarget = MF->getSubtarget(); @@ -277,14 +323,14 @@ void InterBlockScheduling::enterFunction(MachineFunction *MF) { } void InterBlockScheduling::leaveFunction() { - DEBUG_BLOCKS(dbgs() << "<< leaveFunction\n"); + DEBUG_BLOCKS(dbgs() << "PSEND Function\n"); Blocks.clear(); } void InterBlockScheduling::enterBlock(MachineBasicBlock *BB) { CurrentBlockState = &getBlockState(BB); CurrentBlockState->resetRegion(); - DEBUG_BLOCKS(dbgs() << " >> enterBlock " << BB->getNumber() << " " + DEBUG_BLOCKS(dbgs() << "PSBEGIN Block " << BB->getNumber() << " " << CurrentBlockState->kindAsString() << " FixPointIter=" << CurrentBlockState->FixPoint.NumIters << " II=" << CurrentBlockState->FixPoint.II << "\n"); @@ -371,7 +417,7 @@ class PipelineExtractor : public PipelineScheduleVisitor { } // namespace bool InterBlockScheduling::leaveBlock() { - DEBUG_BLOCKS(dbgs() << " << leaveBlock " + DEBUG_BLOCKS(dbgs() << "PSEND Block " << CurrentBlockState->TheBlock->getNumber() << "\n"); // After scheduling a basic block, check convergence to determine which block // to schedule next and with what parameters @@ -393,8 +439,7 @@ bool InterBlockScheduling::leaveBlock() { BS.clearSchedule(); PipelineExtractor GenSchedule(*this, BS, *TII); auto &PostSWP = BS.getPostSWP(); - PostSWP.visitPipelineSchedule(GenSchedule); - PostSWP.updateTripCount(); + PostSWP.materializePipeline(GenSchedule); break; } case SchedulingStage::SchedulingDone: @@ -539,6 +584,32 @@ SchedulingStage InterBlockScheduling::updateFixPoint(BlockState &BS) { return updatePipelining(BS); } +// Get the first pipeliner mode to try based on command line options. 
+static PostPipelinerMode firstPipelinerMode() { + if (PostPipelinerPhysMode) { + return PostPipelinerMode::Physical; + } + if (PostPipelinerVRegMode) { + return PostPipelinerMode::Virtual; + } + if (PostPipelinerVRegReservedMode) { + return PostPipelinerMode::ReservedVirtual; + } + return PostPipelinerMode::None; +} + +// Get the next pipeliner mode to try after the current one. +// Returns None when past the last mode. +static PostPipelinerMode nextPipelinerMode(PostPipelinerMode Current) { + if (Current == PostPipelinerMode::Physical && PostPipelinerVRegMode) { + return PostPipelinerMode::Virtual; + } + if (Current == PostPipelinerMode::Virtual && PostPipelinerVRegReservedMode) { + return PostPipelinerMode::ReservedVirtual; + } + return PostPipelinerMode::None; +} + SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { if (BS.FixPoint.NumIters > MaxExpensiveIterations + 2 * HR->getConflictHorizon()) { @@ -609,13 +680,22 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { << "\n"); // The loop schedule has converged, so we could declare our work done. 
- // But first try SWP + // But first try SWP if we have a single region and pipelining is enabled if (BS.getRegions().size() == 1) { auto &PostSWP = BS.getPostSWP(); if (PostSWP.isPostPipelineCandidate(*BS.TheBlock)) { - BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock); - BS.FixPoint.IITries = 1; - return SchedulingStage::Pipelining; + // Determine which pipelining mode to use + BS.FixPoint.PipelinerMode = firstPipelinerMode(); + if (BS.FixPoint.PipelinerMode == PostPipelinerMode::None) { + return SchedulingStage::SchedulingDone; + } + + const int ResMII = PostSWP.getResMII(*BS.TheBlock); + if (ResMII <= PostPipelinerMaxII) { + BS.FixPoint.II = ResMII; + BS.FixPoint.IITries = 1; + return SchedulingStage::Pipelining; + } } } return SchedulingStage::SchedulingDone; @@ -624,14 +704,36 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { SchedulingStage InterBlockScheduling::updatePipelining(BlockState &BS) { // We have been pipelining. Check whether we were successful. if (BS.FixPoint.Stage == SchedulingStage::PipeliningDone) { - return BS.FixPoint.Stage; + return SchedulingStage::PipeliningDone; } - // Otherwise try a larger II. + // If pipelining is disabled, we shouldn't be here + if (BS.FixPoint.PipelinerMode == PostPipelinerMode::None) { + return SchedulingStage::PipeliningFailed; + } + + // We failed. undo all changes that were required for this attempt. + BS.restorePipelining(); + + // Try the next mode at the same II. + const PostPipelinerMode NextMode = + nextPipelinerMode(BS.FixPoint.PipelinerMode); + if (NextMode != PostPipelinerMode::None) { + BS.FixPoint.PipelinerMode = NextMode; + DEBUG_LOOPAWARE(dbgs() << "Trying next mode at II=" << BS.FixPoint.II + << "\n"); + return SchedulingStage::Pipelining; + } + + // We progressed through all pipeliner modes and failed. + // Try a larger II. // We cut off at larger IIs to prevent excessive compilation time. 
if (++BS.FixPoint.II <= PostPipelinerMaxII && ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) { - return SchedulingStage::Pipelining; + BS.FixPoint.PipelinerMode = firstPipelinerMode(); + if (BS.FixPoint.PipelinerMode != PostPipelinerMode::None) { + return SchedulingStage::Pipelining; + } } auto *BB = BS.TheBlock; @@ -1125,6 +1227,54 @@ void BlockState::setPipelined() { FixPoint.Stage = SchedulingStage::PipeliningDone; } +void BlockState::initPipelining() { + // Should only be called when actually pipelining. + assert(FixPoint.PipelinerMode != PostPipelinerMode::None && + "initPipelining called when not pipelining"); + + DEBUG_REGALLOC(dbgs() << "initPipelining called with mode=" + << getPostPipelinerModeName(FixPoint.PipelinerMode) + << " II=" << FixPoint.II << "\n"); + + // For virtual modes, virtualize the already-analyzed live ranges. + if (FixPoint.PipelinerMode == PostPipelinerMode::Virtual || + FixPoint.PipelinerMode == PostPipelinerMode::ReservedVirtual) { + assert(RegTracker && "RegTracker must exist in virtual modes"); + + // The analysis was already performed once in initInterBlock. + // We just need to virtualize the physical registers for this attempt. + const RegLiveRangeTracker::OverlapPolicy Policy = + (FixPoint.PipelinerMode == PostPipelinerMode::Virtual) + ? RegLiveRangeTracker::OverlapPolicy:: + DisallowOverlapWithReservedBase + : RegLiveRangeTracker::OverlapPolicy::AllowOverlapWithReservedBase; + + RegTracker->virtualizeFilteredPhysRegs(Policy); + DEBUG_REGALLOC(dbgs() << "Virtualized with policy=" + << (Policy == RegLiveRangeTracker::OverlapPolicy:: + DisallowOverlapWithReservedBase + ? "DisallowOverlap" + : "AllowOverlap") + << " for pipelining attempt at II=" << FixPoint.II + << "\n"); + } +} + +void BlockState::restorePipelining() { + // Restore to the original allocation of the virtual registers. 
+ if (FixPoint.PipelinerMode == PostPipelinerMode::Virtual || + FixPoint.PipelinerMode == PostPipelinerMode::ReservedVirtual) { + assert(RegTracker && "RegTracker must exist in virtual modes"); + + // Only restore if registers are still virtualized. + if (RegTracker->areRegistersVirtualized()) { + // Restore physical registers but keep the analysis results. + // The analysis is invariant and will be reused for the next attempt. + RegTracker->restoreOriginalPhysRegs(); + } + } +} + int BlockState::getScheduleLength() const { int Length = 0; for (auto &R : Regions) { @@ -1185,16 +1335,70 @@ void BlockState::initInterBlock(const MachineSchedContext &Context, }) && "Loop cannot have fixed instructions"); BoundaryEdges = std::make_unique(Context); + + // Start with None - we'll determine the actual mode after scheduling + // converges + FixPoint.PipelinerMode = PostPipelinerMode::None; + if (Regions.size() == 1) { - // Don't worry, this just constructs a mostly empty container class - auto NumInstrs = getTop().getFreeInstructions().size(); - PostSWP = std::make_unique(HR, NumInstrs); - - // perform static assignment of multi-slot pseudos - if (EnableMultiSlotInstrMaterialization && - PostSWP->isPostPipelineCandidate(*TheBlock)) { - staticallyMaterializeMultiSlotInstructions(*TheBlock, HR, - MaterializePipeline); + // Create the persistent tracker that will be used throughout pipelining + RegTracker = std::make_unique(*TheBlock); + + // Create PostSWP with the persistent tracker + const auto NumInstrs = getTop().getFreeInstructions().size(); + PostSWP = std::make_unique(HR, NumInstrs, *RegTracker, + *TheBlock->getParent()); + + // Check if isPostPipelineCandidate, if so, perform materialization and + // register tracking. + // Also run analysis if TestRegDefUseTracker is enabled (for testing). + // Only proceed if at least one pipelining mode is enabled. 
+    // Every mode that firstPipelinerMode()/nextPipelinerMode() can select must
+    // be counted here. The virtual modes virtualize the live ranges computed
+    // by the analysis below, so leaving out ReservedVirtual would let that
+    // mode run on a tracker that was never analyzed.
+    const bool PipeliningEnabled = PostPipelinerPhysMode ||
+                                   PostPipelinerVRegMode ||
+                                   PostPipelinerVRegReservedMode;
+    if ((PipeliningEnabled && PostSWP->isPostPipelineCandidate(*TheBlock)) ||
+        TestRegDefUseTracker) {
+      // Perform static assignment of multi-slot pseudos
+      if (EnableMultiSlotInstrMaterialization) {
+        staticallyMaterializeMultiSlotInstructions(*TheBlock, HR,
+                                                   MaterializePipeline);
+      }
+
+      // Run register live range analysis ONCE using the invariant semantic
+      // order. This analysis is done after static MSP materialization to
+      // analyze the materialized state. The semantic order and physical
+      // register state are invariant across all pipelining attempts, so we
+      // only need to analyze once.
+      RegTracker->analyze(*TheBlock, getTop().getFreeInstructions());
+      DEBUG_REGALLOC(RegTracker->dump("FINAL LIVE RANGES\n"));
+
+      // Optionally filter out live ranges with no register choice.
+      // This is also done once since the available registers don't change.
+      if (FilterNoChoiceRegs) {
+        RegTracker->filterByRegisterAvailability();
+        DEBUG_REGALLOC(dbgs() << "After filtering by register availability:\n");
+        DEBUG_REGALLOC(RegTracker->dump());
+      }
+
+      // Find and dump the most promising scarce range set.
+ const auto &ScarceRanges = RegTracker->getMostPromisingScarceRanges(); + DEBUG_REGALLOC({ + dbgs() << "Most promising scarce range set: " << ScarceRanges.size() + << " ranges\n"; + if (!ScarceRanges.empty()) { + const TargetRegisterInfo *TRI = + TheBlock->getParent()->getSubtarget().getRegisterInfo(); + dbgs() << "Register class: " + << TRI->getRegClassName(ScarceRanges[0]->getRegisterClass()) + << "\n"; + for (size_t I = 0; I < ScarceRanges.size(); ++I) { + const auto *LR = ScarceRanges[I]; + dbgs() << " [" << I + << "] BaseReg=" << TRI->getName(LR->getBaseReg()) + << " Defs=" << LR->getNumDefs() + << " Uses=" << LR->getNumUses() << "\n"; + } + } + }); } } diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index eddc50a6ae87..817a3da1955c 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // @@ -22,6 +22,7 @@ #include "AIEDataDependenceHelper.h" #include "AIEHazardRecognizer.h" #include "AIEPostPipeliner.h" +#include "AIERegDefUseTracker.h" #include "Utils/AIELoopUtils.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -82,6 +83,13 @@ class InterBlockEdges { // handling. enum class BlockType { Regular, Loop, Epilogue }; +// PostPipelinerMode determines whether the postpipeliner operates on physical +// registers or virtualizes them for better scheduling opportunities. 
+enum class PostPipelinerMode { None, Physical, Virtual, ReservedVirtual }; + +// Helper function to get the name of a PostPipelinerMode as a string +const char *getPostPipelinerModeName(PostPipelinerMode Mode); + // These are states in the state machine that drives scheduling enum class SchedulingStage { // We are gathering all regions in the block to initialize the BlockState. @@ -114,6 +122,8 @@ enum class SchedulingStage { class FixedpointState { public: SchedulingStage Stage = SchedulingStage::Scheduling; + // PostPipeliner mode - physical or virtual register mode + PostPipelinerMode PipelinerMode = PostPipelinerMode::None; // Parameters of the loop-aware convergence int LatencyMargin = 0; SmallMapVector PerMILatencyMargin; @@ -207,6 +217,9 @@ class BlockState { // This holds an instance of the PostPipeliner for candidate loops. std::unique_ptr PostSWP; + // This holds an instance of the RegLiveRangeTracker for loops. + std::unique_ptr RegTracker; + public: BlockState(MachineBasicBlock *Block); MachineBasicBlock *TheBlock = nullptr; @@ -271,6 +284,14 @@ class BlockState { void clearSchedule(); void setPipelined(); + + /// Initialize for pipelining - virtualizes physical registers if in test mode + void initPipelining(); + + /// Restore after failed pipelining - restores physical registers if + /// virtualized + void restorePipelining(); + bool isScheduled() const { return FixPoint.Stage == SchedulingStage::SchedulingDone || isPipelined() || pipeliningFailed(); diff --git a/llvm/lib/Target/AIE/AIELiveRangeUtils.cpp b/llvm/lib/Target/AIE/AIELiveRangeUtils.cpp new file mode 100644 index 000000000000..54367d8b9859 --- /dev/null +++ b/llvm/lib/Target/AIE/AIELiveRangeUtils.cpp @@ -0,0 +1,194 @@ +//===- AIELiveRangeUtils.cpp - Live Range Utilities -----------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIELiveRangeUtils.h" +#include "AIEHazardRecognizer.h" +#include "AIERegDefUseTracker.h" +#include "AIEScheduleInterpreter.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/ResourceScoreboard.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aie-live-range-utils" + +using namespace llvm; + +namespace llvm::AIE { + +LiveRangeScheduleResult +computeMinimalSchedule(const RegLiveRange &LR, const ScheduleDAG &DAG, + const AIEHazardRecognizer &HR, + const AIEScheduleInterpreter &Interp) { + // TODO: Determine optimal scoreboard bounds based on pipeline depth + // and latencies. For now, use a fixed range. + constexpr int ScoreboardLowerBound = -32; + constexpr int ScoreboardUpperBound = 31; + + // Create a local scoreboard for this scheduling attempt. + ResourceScoreboard LocalScoreboard; + LocalScoreboard.config(ScoreboardLowerBound, ScoreboardUpperBound); + + // Collect instructions from the live range, defs first, then uses. + // This provides a natural topological ordering for most cases. + SmallVector Instructions; + DenseSet Seen; + + // Collect def instructions. + for (const auto &DefInfo : LR.defs()) { + const MachineInstr *MI = DefInfo.getOperand()->getParent(); + if (Seen.insert(MI).second) + Instructions.push_back(MI); + } + + // Collect use instructions. + for (const auto &UseInfo : LR.uses()) { + const MachineInstr *MI = UseInfo.getOperand()->getParent(); + if (Seen.insert(MI).second) + Instructions.push_back(MI); + } + + // Build a map from MachineInstr to SUnit for dependency tracking. 
+ // The DAG may contain multiple copies of instructions (for pipelining). + // Use try_emplace to only map the first occurrence of each instruction. + DenseMap MIToSUnit; + for (SUnit &SU : const_cast(DAG).SUnits) { + MachineInstr *MI = SU.getInstr(); + assert(MI && "SUnit must have a MachineInstr"); + MIToSUnit.try_emplace(MI, &SU); + } + + // Schedule instructions with multiple scans. + // Track which instructions have been scheduled. + DenseMap IssueCycles; + DenseSet Scheduled; + + // Keep scanning until all instructions are scheduled. + while (Scheduled.size() < Instructions.size()) { + bool MadeProgress = false; + + for (const MachineInstr *MI : Instructions) { + if (Scheduled.count(MI)) + continue; + + SUnit *SU = MIToSUnit.lookup(MI); + assert(SU && "Could not find SUnit for instruction in live range"); + + // Check if all predecessors within the live range are scheduled. + bool CanSchedule = true; + int EarliestCycle = 0; + + for (const SDep &Pred : SU->Preds) { + if (SUnit *PredSU = Pred.getSUnit()) { + const MachineInstr *PredMI = PredSU->getInstr(); + if (PredMI && Seen.count(PredMI)) { + if (!Scheduled.count(PredMI)) { + CanSchedule = false; + break; + } + // Account for latency (can be negative). + int PredCycle = IssueCycles[PredMI]; + int MinCycle = PredCycle + static_cast(Pred.getLatency()); + EarliestCycle = std::max(EarliestCycle, MinCycle); + } + } + } + + if (!CanSchedule) + continue; + + // Find the earliest cycle without structural hazards. + // Start from EarliestCycle (which can be negative). + int IssueCycle = EarliestCycle; + while (HR.getHazardType(LocalScoreboard, MI, IssueCycle) != + ScheduleHazardRecognizer::NoHazard) { + ++IssueCycle; + } + + // Schedule the instruction. + IssueCycles[MI] = IssueCycle; + Scheduled.insert(MI); + MadeProgress = true; + + // Update local scoreboard. + HR.emitInScoreboard(LocalScoreboard, *MI, MI->getDesc(), IssueCycle); + } + + // We must make progress in each iteration. 
+ if (!MadeProgress) { + LLVM_DEBUG({ + dbgs() + << "Failed to make scheduling progress. Remaining instructions:\n"; + for (const MachineInstr *MI : Instructions) { + if (!Scheduled.count(MI)) { + dbgs() << " Unscheduled: " << *MI; + SUnit *SU = MIToSUnit.lookup(MI); + if (SU) { + dbgs() << " Waiting for predecessors:\n"; + for (const SDep &Pred : SU->Preds) { + if (SUnit *PredSU = Pred.getSUnit()) { + const MachineInstr *PredMI = PredSU->getInstr(); + if (PredMI && Seen.count(PredMI) && + !Scheduled.count(PredMI)) { + dbgs() << " " << *PredMI; + } + } + } + } + } + } + }); + } + assert(MadeProgress && "Failed to make scheduling progress"); + } + + // Generate events for all scheduled instructions. + EventSchedule Schedule; + for (const MachineInstr *MI : Instructions) { + int IssueCycle = IssueCycles[MI]; + Interp.addInstructionEvents(*MI, IssueCycle, Schedule); + } + + // Compute the minimal live length from the event schedule. + // Find the earliest def event and latest use event for this live range. + int MinDefCycle = INT_MAX; + int MaxUseCycle = INT_MIN; + + for (size_t Cycle = 0; Cycle < Schedule.size(); ++Cycle) { + for (const auto &Event : Schedule[Cycle]) { + // Check if this event belongs to an instruction in our live range. + if (!Seen.count(Event.MI)) + continue; + + if (Event.Type == EventType::Write) { + // This is a def event - update earliest def cycle. + MinDefCycle = std::min(MinDefCycle, static_cast(Cycle)); + } else if (Event.Type == EventType::Read) { + // This is a use event - update latest use cycle. + MaxUseCycle = std::max(MaxUseCycle, static_cast(Cycle)); + } + } + } + + // The minimal live length is the distance from first def event to the cycle + // before the last use event (the value is live from def until consumed). 
+ unsigned MinimalLength = 0; + if (MinDefCycle != INT_MAX && MaxUseCycle != INT_MIN) { + MinimalLength = MaxUseCycle - MinDefCycle; + } + + return LiveRangeScheduleResult(MinimalLength); +} + +} // end namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIELiveRangeUtils.h b/llvm/lib/Target/AIE/AIELiveRangeUtils.h new file mode 100644 index 000000000000..51c67dfdb6c2 --- /dev/null +++ b/llvm/lib/Target/AIE/AIELiveRangeUtils.h @@ -0,0 +1,59 @@ +//===- AIELiveRangeUtils.h - Live Range Utilities -------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file contains utilities for analyzing and scheduling live ranges. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIELIVERANGEUTILS_H +#define LLVM_LIB_TARGET_AIE_AIELIVERANGEUTILS_H + +namespace llvm { + +class AIEHazardRecognizer; +class AIEScheduleInterpreter; +class RegLiveRange; +class ScheduleDAG; + +namespace AIE { + +/// Result of live range scheduling analysis. +class LiveRangeScheduleResult { + unsigned MinimalLength; + +public: + LiveRangeScheduleResult(unsigned MinimalLength) + : MinimalLength(MinimalLength) {} + + /// Get the minimal live length for the range. + unsigned getMinimalLiveLength() const { return MinimalLength; } +}; + +/// Compute the minimal live length for a single live range. +/// +/// Schedules the instructions in the live range (defs and uses) greedily +/// using the AIEScheduleInterpreter for latency information and +/// AIEHazardRecognizer for structural resource checking. Returns the +/// minimal event-space coverage from first def to last use. 
+/// +/// \param LR The live range to schedule +/// \param DAG The schedule DAG providing dependency information +/// \param HR The hazard recognizer for resource checking +/// \param Interp The schedule interpreter providing latency/event mapping +/// \return Result containing the minimal live length +LiveRangeScheduleResult +computeMinimalSchedule(const RegLiveRange &LR, const ScheduleDAG &DAG, + const AIEHazardRecognizer &HR, + const AIEScheduleInterpreter &Interp); + +} // end namespace AIE +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AIE_AIELIVERANGEUTILS_H diff --git a/llvm/lib/Target/AIE/AIELivenessVector.cpp b/llvm/lib/Target/AIE/AIELivenessVector.cpp new file mode 100644 index 000000000000..90cc008708c6 --- /dev/null +++ b/llvm/lib/Target/AIE/AIELivenessVector.cpp @@ -0,0 +1,177 @@ +//===- AIELivenessVector.cpp - Liveness vector implementation ------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file implements a vector-like container for liveness information that +// provides safe out-of-range access and common operations. +// +//===----------------------------------------------------------------------===// + +#include "AIELivenessVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace llvm; + +namespace llvm { +namespace AIE { + +bool Liveness::conflictsWith(const Liveness &Other) const { + // Check register file lane conflicts. + if ((Lanes & Other.Lanes).any()) { + return true; + } + + // Check bypass conflicts: read in one, write in other (same class). 
+ for (unsigned ReadClass : BypassReads) { + if (llvm::is_contained(Other.BypassWrites, ReadClass)) { + return true; + } + } + for (unsigned WriteClass : BypassWrites) { + if (llvm::is_contained(Other.BypassReads, WriteClass)) { + return true; + } + } + + // Check bypass vs register file conflicts. + // If one has bypass activity and the other has register lanes, they + // conflict because they share the same register address. + const bool ThisHasBypass = !BypassReads.empty() || !BypassWrites.empty(); + const bool OtherHasBypass = + !Other.BypassReads.empty() || !Other.BypassWrites.empty(); + + if (ThisHasBypass && Other.Lanes.any()) { + return true; + } + if (OtherHasBypass && Lanes.any()) { + return true; + } + + return false; +} + +LivenessVector::LivenessVector(size_t Size) : Elements(Size) {} + +LivenessVector::LivenessVector(size_t Size, LaneBitmask InitialValue) + : Elements(Size, Liveness(InitialValue)) {} + +size_t LivenessVector::size() const { return Elements.size(); } + +bool LivenessVector::empty() const { return Elements.empty(); } + +Liveness &LivenessVector::operator[](size_t Index) { + assert(Index < Elements.size() && "Index out of range"); + return Elements[Index]; +} + +const Liveness &LivenessVector::operator[](size_t Index) const { + assert(Index < Elements.size() && "Index out of range"); + return Elements[Index]; +} + +Liveness LivenessVector::at(size_t Index) const { + if (Index >= Elements.size()) { + return Liveness(); + } + return Elements[Index]; +} + +const SmallVector &LivenessVector::getElements() const { + return Elements; +} + +LivenessVector &LivenessVector::operator|=(const LivenessVector &Other) { + // Determine the maximum size needed + const size_t MaxSize = std::max(Elements.size(), Other.Elements.size()); + + // Extend this vector if needed + if (MaxSize > Elements.size()) { + Elements.resize(MaxSize); + } + + // Union using at() which returns empty for out-of-bounds + for (size_t I = 0; I < MaxSize; ++I) { + Elements[I] |= 
Other.at(I); + } + return *this; +} + +LivenessVector &LivenessVector::operator&=(const LivenessVector &Other) { + // Use at() which returns empty for out-of-bounds + for (size_t I = 0; I < Elements.size(); ++I) { + Elements[I] &= Other.at(I); + } + return *this; +} + +LivenessVector &LivenessVector::operator-=(const LivenessVector &Other) { + // Use at() which returns empty for out-of-bounds + for (size_t I = 0; I < Elements.size(); ++I) { + Elements[I] -= Other.at(I); + } + return *this; +} + +LivenessVector LivenessVector::operator|(const LivenessVector &Other) const { + LivenessVector Result = *this; + Result |= Other; + return Result; +} + +LivenessVector LivenessVector::operator&(const LivenessVector &Other) const { + LivenessVector Result = *this; + Result &= Other; + return Result; +} + +LivenessVector LivenessVector::operator-(const LivenessVector &Other) const { + LivenessVector Result = *this; + Result -= Other; + return Result; +} + +bool LivenessVector::overlaps(const LivenessVector &Other) const { + const size_t MinSize = std::min(Elements.size(), Other.Elements.size()); + for (size_t I = 0; I < MinSize; ++I) { + if (Elements[I].conflictsWith(Other.Elements[I])) { + return true; + } + } + return false; +} + +bool LivenessVector::any() const { + return llvm::any_of(Elements, [](const Liveness &L) { return L.any(); }); +} + +bool LivenessVector::none() const { + return llvm::none_of(Elements, [](const Liveness &L) { return L.any(); }); +} + +void LivenessVector::dump() const { + print(dbgs()); + dbgs() << '\n'; +} + +void LivenessVector::print(raw_ostream &OS) const { + OS << "["; + for (size_t I = 0; I < Elements.size(); ++I) { + if (I > 0) + OS << ", "; + OS << PrintLaneMask(Elements[I].getLanes()); + } + OS << "]"; +} + +} // namespace AIE +} // namespace llvm diff --git a/llvm/lib/Target/AIE/AIELivenessVector.h b/llvm/lib/Target/AIE/AIELivenessVector.h new file mode 100644 index 000000000000..ee1901484529 --- /dev/null +++ 
b/llvm/lib/Target/AIE/AIELivenessVector.h
@@ -0,0 +1,222 @@
+//===- AIELivenessVector.h - Liveness vector container ---------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a vector-like container for liveness information that
+// provides safe out-of-range access and common operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AIE_AIELIVENESSVECTOR_H
+#define LLVM_LIB_TARGET_AIE_AIELIVENESSVECTOR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/LaneBitmask.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace AIE {
+
+/// Liveness information for a single cycle/offset.
+/// Tracks both register file lanes and bypass usage to detect conflicts.
+class Liveness {
+private:
+  LaneBitmask Lanes;
+  // Set of bypass classes being read from at this cycle
+  SmallVector BypassReads;
+  // Set of bypass classes being written to at this cycle
+  SmallVector BypassWrites;
+
+public:
+  /// Construct with no lanes live
+  Liveness() : Lanes(LaneBitmask::getNone()) {}
+
+  /// Construct with specific lane mask and no bypass activity
+  Liveness(LaneBitmask L) : Lanes(L) {}
+
+  /// Get the lane mask
+  LaneBitmask getLanes() const { return Lanes; }
+
+  /// Set the lane mask
+  void setLanes(LaneBitmask L) { Lanes = L; }
+
+  /// Add a bypass read for a forwarding class; 0 and duplicates are ignored
+  void addBypassRead(unsigned ForwardingClass) {
+    if (ForwardingClass != 0 &&
+        !llvm::is_contained(BypassReads, ForwardingClass)) {
+      BypassReads.push_back(ForwardingClass);
+    }
+  }
+
+  /// Add a bypass write for a forwarding class; 0 and duplicates are ignored
+  void addBypassWrite(unsigned ForwardingClass) {
+    if (ForwardingClass != 0 &&
+        !llvm::is_contained(BypassWrites, ForwardingClass)) {
+      BypassWrites.push_back(ForwardingClass);
+    }
+  }
+
+  /// Get bypass reads
+  ArrayRef getBypassReads() const { return BypassReads; }
+
+  /// Get bypass writes
+  ArrayRef getBypassWrites() const { return BypassWrites; }
+
+  /// Check if this liveness conflicts with another.
+  /// Conflicts occur when:
+  /// 1. Register file lanes overlap, OR
+  /// 2. A bypass read and bypass write use the same forwarding class, OR
+  /// 3. One has bypass activity and the other has register lanes
+  ///    (they share the same register address)
+  bool conflictsWith(const Liveness &Other) const;
+
+  /// Union with another liveness
+  Liveness &operator|=(const Liveness &Other) {
+    Lanes |= Other.Lanes;
+    // Merge bypass reads
+    for (unsigned FC : Other.BypassReads) {
+      addBypassRead(FC);
+    }
+    // Merge bypass writes
+    for (unsigned FC : Other.BypassWrites) {
+      addBypassWrite(FC);
+    }
+    return *this;
+  }
+
+  /// Intersection with another liveness
+  Liveness &operator&=(const Liveness &Other) {
+    Lanes &= Other.Lanes;
+    // For intersection, keep only bypass classes present in both
+    SmallVector NewBypassReads;
+    for (unsigned FC : BypassReads) {
+      if (llvm::is_contained(Other.BypassReads, FC)) {
+        NewBypassReads.push_back(FC);
+      }
+    }
+    BypassReads = std::move(NewBypassReads);
+
+    SmallVector NewBypassWrites;
+    for (unsigned FC : BypassWrites) {
+      if (llvm::is_contained(Other.BypassWrites, FC)) {
+        NewBypassWrites.push_back(FC);
+      }
+    }
+    BypassWrites = std::move(NewBypassWrites);
+    return *this;
+  }
+
+  /// Difference with another liveness
+  Liveness &operator-=(const Liveness &Other) {
+    Lanes &= ~Other.Lanes;
+    // For difference, remove bypass classes present in Other
+    SmallVector NewBypassReads;
+    for (unsigned FC : BypassReads) {
+      if (!llvm::is_contained(Other.BypassReads, FC)) {
+        NewBypassReads.push_back(FC);
+      }
+    }
+    BypassReads = std::move(NewBypassReads);
+
+    SmallVector NewBypassWrites;
+    for (unsigned FC : BypassWrites) {
+      if (!llvm::is_contained(Other.BypassWrites, FC)) {
+        NewBypassWrites.push_back(FC);
+      }
+    }
+    BypassWrites = std::move(NewBypassWrites);
+    return *this;
+  }
+
+  /// Check if any lanes are live or any bypasses are active
+  bool any() const {
+    return Lanes.any() || !BypassReads.empty() || !BypassWrites.empty();
+  }
+
+  /// Check if no lanes are live and no bypasses are active
+  bool none() const {
+    return Lanes.none() && BypassReads.empty() && BypassWrites.empty();
+  }
+
+  /// Get the number of lanes set
+  unsigned getNumLanes() const { return Lanes.getNumLanes(); }
+
+  /// Implicit conversion to LaneBitmask for compatibility (lanes only)
+  operator LaneBitmask() const { return Lanes; }
+};
+
+/// A vector-like container for liveness information that provides safe
+/// out-of-range access and common operations.
+class LivenessVector {
+private:
+  SmallVector Elements;
+
+public:
+  /// Construct with given size, all elements initialized to no liveness
+  explicit LivenessVector(size_t Size = 0);
+
+  /// Construct with given size and initial lane mask
+  LivenessVector(size_t Size, LaneBitmask InitialValue);
+
+  /// Get the size of the vector
+  size_t size() const;
+
+  /// Check if empty
+  bool empty() const;
+
+  /// Access element; the index is checked by an assert only
+  Liveness &operator[](size_t Index);
+  const Liveness &operator[](size_t Index) const;
+
+  /// Safe access - returns a copy, or empty liveness if out of range
+  Liveness at(size_t Index) const;
+
+  /// Get the underlying elements
+  const SmallVector &getElements() const;
+
+  /// Union with another vector; grows to the longer of the two sizes
+  LivenessVector &operator|=(const LivenessVector &Other);
+
+  /// Intersection with another vector; size kept, slots past Other cleared
+  LivenessVector &operator&=(const LivenessVector &Other);
+
+  /// Difference with another vector (this & ~Other); size is kept
+  LivenessVector &operator-=(const LivenessVector &Other);
+
+  /// Create union with another vector
+  LivenessVector operator|(const LivenessVector &Other) const;
+
+  /// Create intersection with another vector
+  LivenessVector operator&(const LivenessVector &Other) const;
+
+  /// Create difference with another vector
+  LivenessVector operator-(const LivenessVector &Other) const;
+
+  /// Check if any element conflicts with Other's element at the same index
+  bool overlaps(const LivenessVector &Other) const;
+
+  /// Check if any element has liveness (lanes or bypass activity)
+  bool any() const;
+
+  /// Check if no elements have liveness
+  bool none() const;
+
+  /// Debug dump
+  void dump() const;
+
+  /// Print the elements' lane masks to stream (bypass sets are omitted)
+  void print(raw_ostream &OS) const;
+};
+
+} // namespace AIE
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AIE_AIELIVENESSVECTOR_H
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index e1c969d26e57..295c31bd376d 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ResourceScoreboard.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include
 
@@ -1498,7 +1499,7 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
   auto &BS = InterBlock.getBlockState(CurMBB);
   const auto &Region = BS.getCurrentRegion();
   int NCopies = 1;
-  if (BS.FixPoint.II) {
+  if (BS.FixPoint.Stage == SchedulingStage::Pipelining) {
     assert(BS.Kind == BlockType::Loop);
     assert(BS.getRegions().size() == 1);
     assert(Region.getBotFixedBundles().empty());
@@ -1508,6 +1509,8 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
     // dependences appear as forward dependences between the first and the
     // second iteration.
     NCopies = 2;
+    // Initialize pipelining.
+    BS.initPipelining();
   }
   DEBUG_BLOCKS(dbgs() << " buildGraph, NCopies=" << NCopies << "\n");
   for (int S = 0; S < NCopies; S++) {
@@ -1519,7 +1522,9 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
   }
   DAG.ExitSU.setInstr(Region.getExitInstr());
   DAG.makeMaps();
-  DAG.buildEdges(Context->AA);
+  // We are in the postscheduler, RPTracker, PDiffs and LIS are null.
+  // For VirtMode: TrackLaneMasks=true and AbandonSingleDefs=false.
+  DAG.buildEdges(Context->AA, RPTracker, PDiffs, LIS, true, false);
   static_cast(DAG).recordDbgInstrs(Region);
 }
 
@@ -1580,6 +1585,9 @@ void AIEScheduleDAGMI::schedule() {
     if (PostSWP.schedule(*this, BS.FixPoint.II, More)) {
       BS.setPipelined();
       LLVM_DEBUG(PostSWP.dump());
+    } else {
+      // Pipelining failed, restore original physical registers.
+      BS.restorePipelining();
     }
     return;
   }
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
index 5a0552b5a1f5..c1506788d512 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
@@ -12,7 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "AIEPostPipeliner.h"
+#include "AIEDataDependenceHelper.h"
+#include "AIELiveRangeUtils.h"
+#include "AIEMachineScheduler.h"
+#include "AIEPostRegAlloc.h"
+#include "AIERegDefUseTracker.h"
 #include "AIESWPSolver.h"
+#include "AIEScarceRegScheduling.h"
+#include "AIEScheduleInterpreter.h"
 #include "AIESlotUtils.h"
 #include "Utils/AIELoopUtils.h"
 #include "Utils/AIEMachineInstrPrint.h"
@@ -23,6 +30,7 @@
 #include "llvm/CodeGen/ResourceScoreboard.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include
 #include
@@ -47,6 +55,15 @@ static cl::opt PresetII("aie-postpipeliner-target-ii",
                          cl::desc("II for which to allow the solver"),
                          cl::init(0), cl::Hidden);
 
+// Debug option, off when 0. Setting it to one will implement the linear
+// schedule without pipeline parallelism.
+static cl::opt
+    ForcedStageCount("aie-postpipeliner-force-stagecount",
+                     cl::desc("Extract a pipeline with the given stage"
+                              " count. This is only granted if it divides the"
+                              " computed stage count."),
+                     cl::init(0), cl::Hidden);
+
 PipelineScheduleVisitor::~PipelineScheduleVisitor() {}
 
 std::optional PostPipelinerStrategy::fitInInterval(
@@ -95,8 +112,10 @@ class PostPipelineDumper : public PipelineScheduleVisitor {
 // The latency state is maintained in an 'Earliest' entry for each SUnit,
 // which is updated whenvever we schedule a predecessor of that SUnit.
 
-PostPipeliner::PostPipeliner(const AIEHazardRecognizer &HR, int NInstr)
-    : HR(HR), NInstr(NInstr) {}
+PostPipeliner::PostPipeliner(const AIEHazardRecognizer &HR, int NInstr,
+                             RegLiveRangeTracker &RegTracker,
+                             const MachineFunction &MF)
+    : HR(HR), RegTracker(RegTracker), Interpreter(MF), NInstr(NInstr) {}
 
 bool PostPipeliner::isPostPipelineCandidate(MachineBasicBlock &LoopBlock) {
   // We leave the single-block loop criterion to our caller. It is fulfilled
@@ -455,6 +474,68 @@ void PostPipeliner::computeRecMII() {
   LLVM_DEBUG(dbgs() << "RecMII=" << RecMII << "\n");
 }
 
+int PostPipeliner::computeScarceRegMII() {
+  int ScarceRegMII = 0;
+
+  // Group scarce live ranges by their base register.
+  DenseMap> ScarceRangesByReg;
+  for (const auto &LR : RegTracker.getLiveRanges()) {
+    // Only consider ranges that are marked as scarce.
+    if (!LR.isScarce()) {
+      continue;
+    }
+
+    MCRegister BaseReg = LR.getBaseReg();
+    if (BaseReg != MCRegister::NoRegister) {
+      ScarceRangesByReg[BaseReg].push_back(&LR);
+    }
+  }
+
+  // For each register with multiple competing scarce ranges, sum the minimal
+  // live lengths; the largest such sum is the value returned (0 if none).
+  DEBUG_WITH_TYPE("aie-reg-liverange", {
+    dbgs() << "\n=== Scarce Register Analysis (II=" << II << ") ===\n";
+  });
+
+  for (const auto &[Reg, Ranges] : ScarceRangesByReg) {
+    // Only consider registers with multiple competing ranges.
+    if (Ranges.size() <= 1)
+      continue;
+
+    unsigned TotalLength = 0;
+    DEBUG_WITH_TYPE("aie-reg-liverange", {
+      const auto *TRI = DAG->MF.getSubtarget().getRegisterInfo();
+      dbgs() << "Register " << TRI->getName(Reg) << " has " << Ranges.size()
+             << " competing ranges (1 available):\n";
+    });
+
+    for (const RegLiveRange *LR : Ranges) {
+      auto Result = AIE::computeMinimalSchedule(*LR, *DAG, HR, Interpreter);
+      unsigned MinLength = Result.getMinimalLiveLength();
+      TotalLength += MinLength;
+
+      DEBUG_WITH_TYPE("aie-reg-liverange", {
+        dbgs() << " Range with " << LR->getNumDefs() << " defs, "
+               << LR->getNumUses() << " uses: minimal length = " << MinLength
+               << "\n";
+      });
+    }
+
+    DEBUG_WITH_TYPE("aie-reg-liverange",
+                    { dbgs() << " Total length: " << TotalLength << "\n"; });
+
+    ScarceRegMII = std::max(ScarceRegMII, static_cast(TotalLength));
+  }
+
+  DEBUG_WITH_TYPE("aie-reg-liverange", {
+    dbgs() << "ScarceRegMII=" << ScarceRegMII << "\n";
+    dbgs() << "============================\n\n";
+  });
+
+  LLVM_DEBUG(dbgs() << "ScarceRegMII=" << ScarceRegMII << "\n");
+  return ScarceRegMII;
+}
+
 bool PostPipeliner::computeLoopCarriedParameters() {
 
   // Initialize slot counts.
@@ -581,12 +662,37 @@ const char *getEdgeColor(SDep::Kind Kind) {
   return "gray";
 }
 
+// Returns the DOT attribute list for Dep: a label holding the latency (plus
+// the register for data/output/anti edges) and the edge color.
+std::string edgeAttributes(const SDep &Dep, const TargetRegisterInfo *TRI) {
+  std::string Label = std::to_string(Dep.getSignedLatency());
+  switch (Dep.getKind()) {
+  case SDep::Data:
+  case SDep::Output:
+  case SDep::Anti: {
+    const Register Reg = Dep.getReg(); // no suffix if neither phys nor virt
+    if (Reg.isPhysical()) {
+      Label += " ";
+      Label += TRI->getName(Reg);
+    } else if (Reg.isVirtual()) {
+      Label += " VR";
+      Label += std::to_string(Register::virtReg2Index(Reg));
+    }
+    break;
+  }
+  case SDep::Order: // order edges contribute only the latency
+    break;
+  }
+  return "[label=\"" + Label + "\", color=" + getEdgeColor(Dep.getKind()) + "]";
+}
+
 void dumpGraph(const ScheduleInfo &Info, ScheduleDAGInstrs *DAG) {
   dbgs() << "digraph {\n";
   const auto *TRI = DAG->MF.getSubtarget().getRegisterInfo();
 
   // Collect backedge sources and destinations for mirroring.
-  SmallVector, 16> Lcds;
+  // Store the full SDep to preserve latency, kind, and register information.
+  SmallVector, 16> Lcds;
   SmallSet LcdSrc;
   SmallSet LcdDst;
 
@@ -603,7 +709,7 @@ void dumpGraph(const ScheduleInfo &Info, ScheduleDAGInstrs *DAG) {
       }
       // This is a backedge from S to D in the next iteration.
       // Add it to the Lcds, and register src and dst nodes.
-      Lcds.emplace_back(S, D0, Dep.getSignedLatency(), Dep.getKind());
+      Lcds.emplace_back(S, D0, Dep);
       LcdSrc.insert(S);
       LcdDst.insert(D0);
     }
@@ -629,43 +735,26 @@ void dumpGraph(const ScheduleInfo &Info, ScheduleDAGInstrs *DAG) {
            << "\"]\n";
   }
 
-  for (const auto &[Src, Dst, Latency, Kind] : Lcds) {
+  // Emit loop-carried dependency edges (mirror edges).
+  for (const auto &[Src, Dst, Dep] : Lcds) {
+    const std::string Attrs = edgeAttributes(Dep, TRI);
     // Create an edge from the split source to the destination.
-    dbgs() << format("\tSU%d_src -> SU%d [label=%d, color=%s]\n", Src, Dst,
-                     Latency, getEdgeColor(Kind));
-    // Create an edge from the source to the split destination
-    dbgs() << format("\tSU%d -> SU%d_dst [label=%d, color=%s]\n", Src, Dst,
-                     Latency, getEdgeColor(Kind));
+    dbgs() << format("\tSU%d_src -> SU%d ", Src, Dst) << Attrs << "\n";
+    // Create an edge from the source to the split destination.
+    dbgs() << format("\tSU%d -> SU%d_dst ", Src, Dst) << Attrs << "\n";
   }
 
+  // Emit regular (intra-iteration) edges.
   for (int K = 0; K < Info.NInstr; K++) {
-    auto &SU = DAG->SUnits[K];
-    for (auto &Dep : SU.Succs) {
-      auto *Succ = Dep.getSUnit();
+    const SUnit &SU = DAG->SUnits[K];
+    for (const SDep &Dep : SU.Succs) {
+      const SUnit *Succ = Dep.getSUnit();
       const int S = Succ->NodeNum;
-      if (S > Info.NInstr || S % Info.NInstr == K || Succ->isBoundaryNode()) {
+      if (S >= Info.NInstr || S % Info.NInstr == K || Succ->isBoundaryNode()) {
         continue;
       }
-
-      dbgs() << "\tSU" << K << " -> " << "SU" << S;
-      dbgs() << " [ label=\"" << Dep.getSignedLatency();
-      switch (Dep.getKind()) {
-      case SDep::Data:
-      case SDep::Output:
-      case SDep::Anti: {
-        const Register Reg = Dep.getReg();
-        if (Reg.isPhysical()) {
-          dbgs() << format(" %s ", TRI->getName(Reg));
-        } else {
-          dbgs() << format(" VR%d ", Register::virtReg2Index(Reg));
-        }
-        break;
-      }
-      case SDep::Order:
-        break;
-      }
-      dbgs() << "\" color=" << getEdgeColor(Dep.getKind()) << " ] ";
-      dbgs() << "\n";
+      dbgs() << "\tSU" << K << " -> SU" << S << " " << edgeAttributes(Dep, TRI)
+             << "\n";
     }
   }
   dbgs() << "}\n";
@@ -764,6 +853,7 @@ int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) {
 
 void PostPipeliner::resetSchedule(bool FullReset) {
   Scoreboard.clear();
+  EventSched.clear();
   int K = 0;
   for (auto &N : Info.Nodes) {
     N.reset(FullReset);
@@ -835,6 +925,9 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) {
     scheduleNode(SU, Actual, Strategy);
     Info.commitCycle(N);
 
+    // Populate event schedule for this representative instruction
+    Interpreter.addInstructionEvents(*SU.getInstr(), Actual, EventSched);
+
     DEBUG_FULL(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull(););
   }
 
@@ -868,6 +961,7 @@ int computeEarliestFromPreds(const SUnit &SU, const ScheduleInfo &Info) {
   return Earliest;
 }
 #endif
+
 } // namespace
 
 bool PostPipeliner::scheduleOtherIterations(PostPipelinerStrategy &Strategy) {
@@ -953,6 +1047,47 @@ bool PostPipeliner::scheduleOtherIterations(PostPipelinerStrategy &Strategy) {
   return true;
 }
 
+bool PostPipeliner::tryScarceRangePacking() {
+  // Check applicability: get the cached most promising scarce range set.
+  const auto &ScarceRangePtrs = RegTracker.getMostPromisingScarceRanges();
+
+  // If no scarce ranges found, this approach is not applicable.
+  if (ScarceRangePtrs.empty()) {
+    return false;
+  }
+
+  // Build ScarceRange objects from the RegLiveRange pointers.
+  std::vector ScarceRanges;
+  ScarceRanges.reserve(ScarceRangePtrs.size());
+  for (const RegLiveRange *LR : ScarceRangePtrs) {
+    ScarceRanges.emplace_back(*LR, *DAG);
+  }
+
+  // Build the scarce-only DAG.
+  buildScarceDAG(ScarceRanges, Info, *DAG);
+
+  // The scarce-only DAG must be acyclic by construction (strict ordering of
+  // uses/defs on the same physreg).
+  assert(checkAcyclic(ScarceRanges) &&
+         "Scarce-only DAG must be acyclic by construction");
+
+  // Create the strategy once (precomputes predecessors and members).
+  BurstMostUrgentStrategy Strategy(*DAG, Info, ScarceRanges, MinLength + II);
+
+  // Enumerate range orders; each one is tried on a partially reset schedule.
+  return enumerateRangeOrders(
+      ScarceRanges, [this, &Strategy](const SmallVector &Order) {
+        // Reset before each attempt.
+        resetSchedule(/*FullReset=*/false);
+
+        // Initialize the strategy with this order.
+        Strategy.init(Order);
+
+        // Try scheduling with this strategy.
+        return scheduleWithStrategy(Strategy);
+      });
+}
+
 bool PostPipeliner::scheduleWithStrategy(PostPipelinerStrategy &S) {
   DEBUG_SUMMARY(dbgs() << "Starting " << S.name() << "\n");
   if (!scheduleFirstIteration(S)) {
@@ -971,6 +1106,10 @@ bool PostPipeliner::scheduleWithStrategy(PostPipelinerStrategy &S) {
 
   Info.applyRotation(II);
   Info.resetRotation();
+  if (!tryAllocateRegisters()) {
+    return false;
+  }
+  DEBUG_SUMMARY(dbgs() << " Register allocation successful\n");
   return true;
 }
 
@@ -1247,6 +1386,15 @@ static const ConfigStrategy::Configuration Heuristics[] = {
 
 bool PostPipeliner::tryApproaches() {
   DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");
+
+  // Try scarce range packing approach (VRegMode only).
+  if (RegTracker.areRegistersVirtualized()) {
+    if (tryScarceRangePacking()) {
+      DEBUG_SUMMARY(dbgs() << " Scarce range packing succeeded\n");
+      return true;
+    }
+  }
+
   int HeuristicIndex = 0;
   for (const auto &Config : Heuristics) {
     if (Heuristic >= 0 && Heuristic != HeuristicIndex++) {
@@ -1379,9 +1527,9 @@ bool PostPipeliner::applySolver(const SolverData &Data, SWPSolver &Solver,
 
 bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval,
                              MachineOptimizationRemarkEmitter &More) {
-
   II = InitiationInterval;
   DAG = &TheDAG;
+  DEBUG_SUMMARY(dbgs() << format("PSBEGIN II=%d\n", II));
 
   // We need to set up a scoreboard that gives us some look-ahead.
   // The look-ahead is used heuristically, to see conflicts with future
@@ -1412,25 +1560,99 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval,
              << "Longest circuit does not fit II." << ore::NV("II", II)
              << ore::NV("BasicBlock", BB->getName());
     });
+    DEBUG_SUMMARY(dbgs() << "PSEND\n");
     return false;
   }
+
+  // Reject II values below the scarce register MII (VRegMode only).
+  if (RegTracker.areRegistersVirtualized()) {
+    const int ScarceRegMII = computeScarceRegMII();
+    if (II < ScarceRegMII) {
+      More.emit([&]() {
+        return MachineOptimizationRemarkMissed("postpipeliner", "schedule",
+                                               DbgLoc, BB)
+               << "Scarce register pressure does not fit II."
+               << ore::NV("II", II) << ore::NV("ScarceRegMII", ScarceRegMII)
+               << ore::NV("BasicBlock", BB->getName());
+      });
+      DEBUG_SUMMARY(dbgs() << "PSEND\n");
+      return false;
+    }
+  }
 
   LLVM_DEBUG(dumpIntervals(Info, MinLength, II));
   if (!tryApproaches()) {
     More.emit([&]() {
       return MachineOptimizationRemarkMissed("postpipeliner", "schedule",
                                              DbgLoc, BB)
-             << "No schedule found.";
+             << "No schedule found with register allocation.";
     });
-    LLVM_DEBUG(dbgs() << "PostPipeliner: No schedule found\n");
+    LLVM_DEBUG(
+        dbgs()
+        << "PostPipeliner: No schedule found with register allocation\n");
+    DEBUG_SUMMARY(dbgs() << "PSEND\n");
     return false;
   }
 
   More.emit([&]() {
     return MachineOptimizationRemark("postpipeliner", "schedule", DbgLoc, BB)
-           << "Schedule found" << ore::NV("NS", NStages) << ore::NV("II", II)
+           << "Schedule found with register allocation"
+           << ore::NV("NS", NStages) << ore::NV("II", II)
            << ore::NV("BasicBlock", BB->getName());
   });
+  LLVM_DEBUG(dbgs() << "PostPipeliner: Success\n");
+  DEBUG_SUMMARY(dbgs() << "PSEND\n");
+  return true;
+}
+
+bool PostPipeliner::tryAllocateRegisters() {
+  // In physical mode, registers are not virtualized and no allocation is needed
+  // This is a trivial allocation that always succeeds
+  if (!RegTracker.areRegistersVirtualized()) {
+    LLVM_DEBUG(
+        dbgs() << "PostPipeliner: Physical mode - no allocation needed\n");
+    return true;
+  }
+
+  auto &MF = *DAG->getBB()->getParent();
+  auto &MRI = MF.getRegInfo();
+  const auto &ST = MF.getSubtarget();
+  const auto *TRI = ST.getRegisterInfo();
+
+  // Compute modulo live lanes from the event schedule populated during
+  // scheduling
+  auto LiveLanesByVirtReg = Interpreter.buildLiveLanes(EventSched, II);
+
+  // Debug dump if requested.
+  DEBUG_WITH_TYPE("aie-postregalloc", {
+    dbgs() << "\n=== Live Intervals ===\n";
+    Interpreter.dumpEventSchedule(EventSched, dbgs());
+    dbgs() << "\n";
+    Interpreter.dumpLiveLanes(LiveLanesByVirtReg, II, dbgs());
+    dbgs() << "=================================\n\n";
+  });
+
+  // Perform register allocation; assignments are collected in VRegToPhysReg.
+  DenseMap VRegToPhysReg;
+  const bool Success = AIEPostRegAlloc::allocate(
+      LiveLanesByVirtReg, II, RegTracker, MF, *TRI, MRI, VRegToPhysReg);
+
+  if (!Success) {
+    LLVM_DEBUG(dbgs() << "PostPipeliner: Register allocation failed\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "PostPipeliner: Register allocation succeeded with "
+                    << VRegToPhysReg.size() << " assignments\n");
+
+  // Apply the register assignments through RegTracker
+  // This properly handles the virtualization state and updates the
+  // MachineFunction
+  RegTracker.rewriteToPhysRegs(VRegToPhysReg);
+
+  LLVM_DEBUG(dbgs() << "PostPipeliner: Applied register allocation through "
+                       "RegTracker\n");
+
   return true;
 }
 
@@ -1520,14 +1742,14 @@ bool PostPipeliner::checkStages() {
 }
 
 void PostPipeliner::visitPipelineSection(
-    PipelineScheduleVisitor &Visitor, int StageCount,
+    PipelineScheduleVisitor &Visitor, int Repeat,
     std::function Filter) const {
-  // This runs StageCount times across the original body instructions and
+  // This runs Repeat times across the original body instructions and
   // calls the bundle emission callbacks according to Filter.
   // It provide the stage and the modulo cycle in that stage
   // (both starting at zero) to the filter
 
-  for (int Stage = 0; Stage < StageCount; Stage++) {
+  for (int Stage = 0; Stage < Repeat; Stage++) {
     for (int M = 0; M < II; M++) {
       Visitor.startBundle();
       for (int K = 0; K < NInstr; K++) {
@@ -1593,6 +1815,28 @@ int PostPipeliner::getFinalMinTripCount() const {
   return MinTripCount - Delta;
 }
 
+void PostPipeliner::materializePipeline(PipelineScheduleVisitor &Visitor) {
+  // A schedule NS=N, II=L is compatible with NS=1, II=N*L. We provide an
+  // actual implementation of such less dense schedules, since it can provide
+  // debugging insights. Non-positive forced stage counts are ignored.
+  if (ForcedStageCount > 0 && NStages % ForcedStageCount == 0 &&
+      NPrologueStages == NStages - 1) {
+    // Fix the II, recompute ModuloCycle and Stage, fix stagecount and
+    // prologue stages count
+    const int Factor = NStages / ForcedStageCount;
+    II *= Factor;
+    for (int K = 0; K < NInstr; K++) {
+      auto &Node = Info[K];
+      Node.update(II);
+    }
+    NStages = ForcedStageCount;
+    NPrologueStages = NStages - 1;
+  }
+
+  visitPipelineSchedule(Visitor);
+  updateTripCount();
+}
+
 void NodeInfo::reset(bool FullReset) {
   Cycle = 0;
   Scheduled = false;
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h
index d664a79700dc..96740f559d78 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.h
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AIE_AIEPOSTPIPELINER_H
 
 #include "AIEHazardRecognizer.h"
+#include "AIEScheduleInterpreter.h"
 #include "AIESlotCounts.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ResourceScoreboard.h"
@@ -27,7 +28,12 @@ class AIEHazardRecognizer;
 class MachineOptimizationRemarkEmitter;
 } // namespace llvm
 
+namespace llvm {
+class RegLiveRangeTracker; // Forward declaration
+} // namespace llvm
+
 namespace llvm::AIE {
+
 namespace Solver {
 class SolverData;
 class SWPSolver;
@@ -220,9 +226,16 @@ class PipelineScheduleVisitor {
 
 class PostPipeliner {
   const AIEHazardRecognizer &HR;
+  RegLiveRangeTracker &RegTracker;
   ScheduleDAGMI *DAG = nullptr;
   const AIEBaseInstrInfo *TII = nullptr;
 
+  // Schedule interpreter for computing modulo live ranges
+  AIEScheduleInterpreter Interpreter;
+
+  // Event schedule populated during scheduling
+  EventSchedule EventSched;
+
   int FirstUnscheduled = 0;
   int LastUnscheduled = -1;
 
@@ -288,6 +301,7 @@ class PostPipeliner {
   void computeForward();
   bool computeBackward();
   void computeRecMII();
+  int computeScarceRegMII();
 
   /// Given Earliest and Latest of each node in the first iteration,
   /// compute the smallest length of the linear schedule that is feasible.
@@ -323,13 +337,24 @@ class PostPipeliner {
   /// Top level strategy scheduler
   bool scheduleWithStrategy(PostPipelinerStrategy &Strategy);
 
+  /// Try to schedule scarce ranges by enumerating orders and using
+  /// BurstMostUrgentStrategy.
+  /// Checks applicability, finds scarce ranges, and attempts scheduling.
+  /// Returns true if scheduling succeeded, false otherwise.
+  bool tryScarceRangePacking();
+
   /// Reset dynamic scheduling data.
   /// If FullReset is set, also reset information collected from earlier
   /// data mining scheduling rounds.
   void resetSchedule(bool FullReset);
 
+  /// Try to allocate registers for the current schedule
+  /// Returns true if register allocation succeeds
+  bool tryAllocateRegisters();
+
 public:
-  PostPipeliner(const AIEHazardRecognizer &HR, int NInstr);
+  PostPipeliner(const AIEHazardRecognizer &HR, int NInstr,
+                RegLiveRangeTracker &RegTracker, const MachineFunction &MF);
 
   /// Check whether this is a suitable loop for the PostPipeliner. It also
   /// leaves some useful information.
@@ -360,12 +385,14 @@ class PostPipeliner {
   // It will not call the section delimitor methods.
   // \param Filter will decide on calling Visitor.addToBundle().
   void visitPipelineSection(
-      PipelineScheduleVisitor &Visitor, int StageCount,
+      PipelineScheduleVisitor &Visitor, int Repeat,
       std::function Filter) const;
 
   // Modify the tripcount to run StageCount-1 less iterations.
void updateTripCount() const; + void materializePipeline(PipelineScheduleVisitor &Visitor); + int getFinalMinTripCount() const; void dump() const; diff --git a/llvm/lib/Target/AIE/AIEPostRegAlloc.cpp b/llvm/lib/Target/AIE/AIEPostRegAlloc.cpp new file mode 100644 index 000000000000..e7f291c4b135 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEPostRegAlloc.cpp @@ -0,0 +1,581 @@ +//===- AIEPostRegAlloc.cpp - Post-scheduling register allocator ----------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file implements a post-scheduling register allocator for AIE targets. +// +//===----------------------------------------------------------------------===// + +#include "AIEPostRegAlloc.h" +#include "AIELivenessVector.h" +#include "AIERegDefUseTracker.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +#define DEBUG_TYPE "aie-postregalloc" + +using namespace llvm; +using namespace llvm::AIE; + +// Initialize allocation state and compute interference graphs. +void AIEPostRegAlloc::AllocState::init( + const TargetRegisterInfo *InTRI, + const DenseMap &LiveLanesByVReg, + const RegLiveRangeTracker *RegTracker, const MachineRegisterInfo &MRI) { + this->RegUnitOccupancy.clear(); + this->PhysOccupancy.clear(); + this->TRI = InTRI; + + const auto &AvailableRegs = RegTracker->getAvailablePhysRegs(); + + // Build register class interference graph once. + // Iterate over LiveRanges to get register class IDs. 
+ DenseSet UsedRCIds; + for (const RegLiveRange &LR : RegTracker->getLiveRanges()) { + if (const TargetRegisterClass *RC = LR.getRegisterClass()) + UsedRCIds.insert(RC->getID()); + } + this->RCInterferenceGraph = + AIEPostRegAlloc::buildRCInterferenceGraph(UsedRCIds, *InTRI); + + // Build virtual register interference graph once. + this->VRegInterferenceGraph = AIEPostRegAlloc::buildVRegInterferenceGraph( + LiveLanesByVReg, MRI, RCInterferenceGraph); + + // Pre-compute metrics for all LiveRanges. + this->AllMetrics.clear(); + for (const RegLiveRange &LR : RegTracker->getLiveRanges()) { + const unsigned VReg = LR.getVReg().id(); + auto It = LiveLanesByVReg.find(VReg); + if (It == LiveLanesByVReg.end()) + continue; + const AIE::LivenessVector &Masks = It->second; + AllMetrics[VReg] = AIEPostRegAlloc::computeMetrics( + LR, Masks, VRegInterferenceGraph, LiveLanesByVReg, RCInterferenceGraph, + AvailableRegs, MRI, *InTRI); + } +} + +// Check if VReg can be placed in PhysReg without conflicts. +bool AIEPostRegAlloc::AllocState::canPlace( + unsigned VReg, Register PhysReg, const AIE::LivenessVector &VRegMasks, + const TargetRegisterClass *RC) const { + + // Check RegUnit conflicts - this handles aliasing automatically. + // Two registers interfere if they share any RegUnits. + for (MCRegUnitIterator Units(PhysReg.asMCReg(), TRI); Units.isValid(); + ++Units) { + unsigned Unit = *Units; + auto It = RegUnitOccupancy.find(Unit); + if (It != RegUnitOccupancy.end()) { + // This RegUnit is occupied. Check if it conflicts with our VRegMasks. + const auto &UnitOcc = It->second; + if (VRegMasks.overlaps(UnitOcc)) { + LLVM_DEBUG(dbgs() << " RegUnit conflict detected for " + << printReg(VReg, TRI) << " in " + << printReg(PhysReg, TRI) << " (unit " << Unit + << ")\n"); + return false; + } + } + } + + return true; +} + +// Place VReg in PhysReg (updates occupancy). 
+//
+// Both occupancy tables are updated with a bitwise-or of VRegMasks: the
+// per-register table (PhysOccupancy) and one entry per register unit of
+// PhysReg (RegUnitOccupancy). canPlace() only reads the register-unit table,
+// so that is the one that decides later conflicts.
+//
+// RC is accepted for symmetry with canPlace() but is not read here.
+void AIEPostRegAlloc::AllocState::place(unsigned VReg, Register PhysReg,
+                                        const AIE::LivenessVector &VRegMasks,
+                                        const TargetRegisterClass *RC) {
+
+  // Per-register view (kept for compatibility).
+  PhysOccupancy[PhysReg] |= VRegMasks;
+
+  // Per-RegUnit view. Registers that alias PhysReg share at least one unit
+  // with it, so marking the units is what blocks the aliases.
+  // The counter only feeds the debug message below; [[maybe_unused]] keeps
+  // builds where LLVM_DEBUG expands to nothing free of set-but-unused
+  // warnings.
+  [[maybe_unused]] unsigned NumUnits = 0;
+  for (MCRegUnitIterator Units(PhysReg.asMCReg(), TRI); Units.isValid();
+       ++Units) {
+    RegUnitOccupancy[*Units] |= VRegMasks;
+    ++NumUnits;
+  }
+
+  LLVM_DEBUG(dbgs() << " Placed " << printReg(VReg, TRI) << " in "
+                    << printReg(PhysReg, TRI) << " (updated " << NumUnits
+                    << " RegUnits)\n");
+}
+
+// Build register class interference graph with asymmetric weights.
+//
+// For every ordered pair (RC1, RC2) of distinct classes in UsedRCIds, count
+// the register pairs (Reg1 in RC1, Reg2 in RC2) that overlap according to
+// TRI.regsOverlap(). When there is at least one, the edge RC1 -> RC2 gets
+//
+//   max(1, 100 * overlapping pairs / number of registers in RC2)
+//
+// i.e. 100 times the average number of RC1 registers blocked by allocating
+// one RC2 register. The factor 100 preserves precision in integer
+// arithmetic. Examples of the resulting asymmetry:
+//   - eY -> VEC512: each VEC512 blocks ~0.5 eY registers
+//   - VEC512 -> eY: each eY blocks ~2 VEC512 registers
+// Pairs of classes without any overlapping registers get no edge.
+AIEPostRegAlloc::WeightedAsymmetricGraph
+AIEPostRegAlloc::buildRCInterferenceGraph(const DenseSet &UsedRCIds,
+                                          const TargetRegisterInfo &TRI) {
+  WeightedAsymmetricGraph Graph;
+
+  // Visit all ordered pairs of register classes.
+  for (unsigned RCId1 : UsedRCIds) {
+    const TargetRegisterClass *RC1 = TRI.getRegClass(RCId1);
+
+    for (unsigned RCId2 : UsedRCIds) {
+      if (RCId1 == RCId2)
+        continue;
+
+      const TargetRegisterClass *RC2 = TRI.getRegClass(RCId2);
+      const unsigned RC2Size = RC2->getNumRegs();
+
+      // Number of (Reg1, Reg2) pairs that overlap, summed over all of RC2.
+      unsigned TotalRC1Blocked = 0;
+      for (MCPhysReg Reg2 : *RC2)
+        for (MCPhysReg Reg1 : *RC1)
+          if (TRI.regsOverlap(Reg1, Reg2))
+            ++TotalRC1Blocked;
+
+      // No overlap at all: the classes are independent, no edge.
+      // An empty RC2 also ends up here, so the division below never sees a
+      // zero divisor.
+      if (TotalRC1Blocked == 0)
+        continue;
+
+      // Any overlap yields a weight of at least 1, even when the scaled
+      // average rounds down to zero.
+      const unsigned Weight = std::max(1u, (TotalRC1Blocked * 100) / RC2Size);
+      Graph.addInterference(RCId1, RCId2, Weight);
+
+      LLVM_DEBUG(dbgs() << "RC interference: " << TRI.getRegClassName(RC1)
+                        << " -> " << TRI.getRegClassName(RC2)
+                        << " weight=" << Weight << " (avg " << TotalRC1Blocked
+                        << "/" << RC2Size << ")\n");
+    }
+  }
+
+  return Graph;
+}
+
+// Build virtual register interference graph (symmetric).
+//
+// Two distinct vregs of LiveLanesByVReg get an edge when
+//   (a) their register classes, as recorded in MRI, interfere according to
+//       RCInterferenceGraph (identical classes always do), and
+//   (b) their liveness vectors overlap.
+// The graph stores unordered pairs, so the order in which the map is walked
+// does not influence the result.
+AIEPostRegAlloc::WeightedSymmetricGraph
+AIEPostRegAlloc::buildVRegInterferenceGraph(
+    const DenseMap &LiveLanesByVReg,
+    const MachineRegisterInfo &MRI,
+    const WeightedAsymmetricGraph &RCInterferenceGraph) {
+
+  WeightedSymmetricGraph Graph;
+
+  // Resolve everything the pairwise loop needs exactly once per vreg, so the
+  // quadratic part below performs no map or MRI lookups.
+  struct Node {
+    unsigned VReg;
+    unsigned RCId;
+    const AIE::LivenessVector *Masks;
+  };
+  std::vector<Node> Nodes;
+  Nodes.reserve(LiveLanesByVReg.size());
+  for (const auto &Entry : LiveLanesByVReg)
+    Nodes.push_back(
+        {Entry.first, MRI.getRegClass(Entry.first)->getID(), &Entry.second});
+
+  // Use symmetry: only look at pairs with I < J.
+  for (size_t I = 0; I < Nodes.size(); ++I) {
+    const Node &A = Nodes[I];
+
+    for (size_t J = I + 1; J < Nodes.size(); ++J) {
+      const Node &B = Nodes[J];
+
+      // Cheap test first: classes that cannot interfere rule the pair out.
+      if (!RCInterferenceGraph.interferes(A.RCId, B.RCId))
+        continue;
+
+      // Then the temporal test on the liveness vectors.
+      if (A.Masks->overlaps(*B.Masks))
+        Graph.addInterference(A.VReg, B.VReg);
+    }
+  }
+
+  return Graph;
+}
+
+// Compute metrics for a live range.
+//
+// The result describes how hard LR is to place:
+//  - TotalLanes, MaxWidth and Duration come from the non-empty elements of
+//    Masks (sum of lane counts, largest lane count, number of elements);
+//  - PureInterferenceDegree counts interfering vregs of the same register
+//    class as LR;
+//  - AliasingInterferenceDegree adds up the RCInterferenceGraph weight
+//    (LR's class -> other class) of interfering vregs of a different class;
+//  - NumAvailableRegs is the size of LR's admissible set intersected with
+//    AvailableRegs.
+// TRI is not used at the moment.
+AIEPostRegAlloc::VRegMetrics AIEPostRegAlloc::computeMetrics(
+    const RegLiveRange &LR, const AIE::LivenessVector &Masks,
+    const WeightedSymmetricGraph &VRegInterferenceGraph,
+    const DenseMap &AllVRegs,
+    const WeightedAsymmetricGraph &RCInterferenceGraph,
+    const DenseSet &AvailableRegs, const MachineRegisterInfo &MRI,
+    const TargetRegisterInfo &TRI) {
+  VRegMetrics Metrics = {0, 0, 0, 0, 0, 0};
+
+  const Register VReg = LR.getVReg();
+
+  // Shape of the live range: only elements with at least one live lane
+  // contribute.
+  for (const auto &Mask : Masks.getElements()) {
+    if (!Mask.any())
+      continue;
+    const unsigned LanesInCycle = Mask.getNumLanes();
+    Metrics.TotalLanes += LanesInCycle;
+    Metrics.MaxWidth = std::max(Metrics.MaxWidth, LanesInCycle);
+    ++Metrics.Duration;
+  }
+
+  // Prefer the register class recorded on the live range. AllocState::init()
+  // treats a missing class as possible, so do not dereference it blindly:
+  // fall back to the class MRI has for the vreg, which is also the class
+  // buildVRegInterferenceGraph() used for it.
+  const TargetRegisterClass *RC = LR.getRegisterClass();
+  if (!RC)
+    RC = MRI.getRegClass(VReg);
+  const unsigned RCId = RC->getID();
+
+  // Split the interference degree into same-class ("pure") and
+  // other-class ("aliasing") contributions.
+  for (const auto &[OtherVReg, _] : AllVRegs) {
+    if (OtherVReg != VReg &&
+        VRegInterferenceGraph.interferes(VReg, OtherVReg)) {
+      // The other vreg's class still comes from MRI; a map from vreg to
+      // live range would remove this dependency.
+      const unsigned OtherRCId = MRI.getRegClass(OtherVReg)->getID();
+
+      if (RCId == OtherRCId) {
+        ++Metrics.PureInterferenceDegree;
+      } else if (RCInterferenceGraph.interferes(RCId, OtherRCId)) {
+        // Directional weight: how much does the other class take away from
+        // this one?
+        Metrics.AliasingInterferenceDegree +=
+            RCInterferenceGraph.getInterferenceWeight(RCId, OtherRCId);
+      }
+    }
+  }
+
+  // Registers this live range may actually use.
+  Metrics.NumAvailableRegs =
+      getCandidatePhysRegs(LR.getAdmissibleRegs(), AvailableRegs).size();
+
+  return Metrics;
+}
+
+// Get allocatable physical registers for a live range.
+// Returns the intersection of AdmissibleRegs (semantic constraint from
+// instruction encoding) and AvailableRegs (global availability).
+//
+// NOTE(review): the result follows the iteration order of AdmissibleRegs.
+// tryAllocate() does first-fit over this list, so the register that gets
+// picked depends on that order - consider sorting if a predictable
+// preference order is wanted.
+std::vector AIEPostRegAlloc::getCandidatePhysRegs(
+    const DenseSet &AdmissibleRegs,
+    const DenseSet &AvailableRegs) {
+
+  std::vector Candidates;
+
+  // Keep every admissible register that is also globally available.
+  for (MCRegister PhysReg : AdmissibleRegs) {
+    if (AvailableRegs.count(PhysReg))
+      Candidates.push_back(PhysReg);
+  }
+
+  return Candidates;
+}
+
+// Try to allocate using a specific scoring function for ordering.
+//
+// Live ranges that have an entry in LiveLanesByVReg are scored with ScoreFn
+// on their pre-computed metrics, sorted by descending score (vreg number
+// breaks ties) and placed first-fit into their candidate registers.
+//
+// Returns
+//  - AllocResult() on success; OutAssign then maps every processed vreg to
+//    its physical register;
+//  - AllocResult(true) as soon as a range is reached whose
+//    PureInterferenceDegree is >= its NumAvailableRegs;
+//  - AllocResult(false) when a range has no candidates or fits in none of
+//    them.
+// On failure OutAssign holds the assignments made so far.
+AIEPostRegAlloc::AllocResult AIEPostRegAlloc::tryAllocate(
+    const DenseMap &LiveLanesByVReg,
+    const RegLiveRangeTracker *RegTracker, const TargetRegisterInfo &TRI,
+    const MachineRegisterInfo &MRI, AllocState &State, ScoringFunction ScoreFn,
+    DenseMap &OutAssign) {
+
+  // Clear per-attempt state.
+  State.RegUnitOccupancy.clear();
+  State.PhysOccupancy.clear();
+  OutAssign.clear();
+
+  const auto &AvailableRegs = RegTracker->getAvailablePhysRegs();
+
+  // One work item per live range to place.
+  struct LRInfo {
+    const RegLiveRange *LR;
+    unsigned VReg;
+    unsigned Score;
+    const AIE::LivenessVector *Masks;
+  };
+
+  // Score and collect LiveRanges using pre-computed metrics from State.
+  std::vector<LRInfo> LRInfos;
+  for (const RegLiveRange &LR : RegTracker->getLiveRanges()) {
+    const unsigned VReg = LR.getVReg().id();
+    auto It = LiveLanesByVReg.find(VReg);
+    if (It == LiveLanesByVReg.end())
+      continue;
+
+    LRInfos.push_back({&LR, VReg, ScoreFn(State.AllMetrics[VReg]), &It->second});
+  }
+
+  // Sort by descending score (hardest first).
+  // Use VReg as tiebreaker for deterministic ordering when scores are equal.
+  llvm::sort(LRInfos, [](const LRInfo &A, const LRInfo &B) {
+    if (A.Score != B.Score)
+      return A.Score > B.Score;
+    return A.VReg < B.VReg;
+  });
+
+  // Try to allocate each LiveRange.
+  for (const auto &Info : LRInfos) {
+    const RegLiveRange &LR = *Info.LR;
+    const unsigned VReg = Info.VReg;
+    const auto &VRegMasks = *Info.Masks;
+    const TargetRegisterClass *RC = LR.getRegisterClass();
+    const auto &Metrics = State.AllMetrics[VReg];
+
+    LLVM_DEBUG(dbgs() << "Allocating " << printReg(VReg, &TRI) << " class="
+                      << TRI.getRegClassName(RC) << " (score=" << Info.Score
+                      << ", available=" << Metrics.NumAvailableRegs
+                      << ", pure_int=" << Metrics.PureInterferenceDegree
+                      << ", alias_int=" << Metrics.AliasingInterferenceDegree
+                      << ")\n");
+
+    // Check for infeasible schedule: pure interference >= available registers.
+    // This is reported as a global failure that ends all further attempts.
+    // NOTE(review): PureInterferenceDegree is a count of interfering
+    // neighbours, not the number of ranges live at the same time. A long
+    // range overlapping many short, mutually disjoint ranges can reach this
+    // bound and still be allocatable - confirm that this conservative
+    // bail-out is intended.
+    if (Metrics.PureInterferenceDegree >= Metrics.NumAvailableRegs) {
+      LLVM_DEBUG(dbgs() << " Infeasible schedule detected: pure interference ("
+                        << Metrics.PureInterferenceDegree
+                        << ") >= available registers ("
+                        << Metrics.NumAvailableRegs << ")\n");
+      return AllocResult(/*InfeasibleSchedule=*/true);
+    }
+
+    // Get candidate physical registers using AdmissibleRegs from LiveRange.
+    const auto Candidates =
+        getCandidatePhysRegs(LR.getAdmissibleRegs(), AvailableRegs);
+
+    if (Candidates.empty()) {
+      LLVM_DEBUG(dbgs() << " No candidates available!\n");
+      return AllocResult(/*InfeasibleSchedule=*/false);
+    }
+
+    // Try to find a suitable physical register (first-fit).
+    Register ChosenPhys;
+    for (Register PhysReg : Candidates) {
+      LLVM_DEBUG(dbgs() << " Trying " << printReg(PhysReg, &TRI) << "\n");
+      if (State.canPlace(VReg, PhysReg, VRegMasks, RC)) {
+        ChosenPhys = PhysReg;
+        break;
+      }
+    }
+
+    if (!ChosenPhys.isValid()) {
+      LLVM_DEBUG(dbgs() << " Failed to find suitable physreg!\n");
+      return AllocResult(/*InfeasibleSchedule=*/false);
+    }
+
+    // Place the VReg and record in output.
+    State.place(VReg, ChosenPhys, VRegMasks, RC);
+    OutAssign[Register(VReg)] = ChosenPhys.asMCReg();
+  }
+
+  LLVM_DEBUG(dbgs() << "Allocation succeeded with " << OutAssign.size()
+                    << " assignments\n");
+  return AllocResult();
+}
+
+// Dump virtual register metrics for debugging.
+//
+// Prints one row per vreg (sorted by vreg index), aggregate statistics, and
+// the number of vregs per register class. Rows whose PureInterferenceDegree
+// is >= NumAvailableRegs are tagged " FAIL".
+void AIEPostRegAlloc::dumpVRegMetrics(
+    const DenseMap &AllMetrics,
+    const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) {
+
+  dbgs() << "=== Virtual Register Metrics Dump ===\n";
+  dbgs() << "Total Virtual Registers: " << AllMetrics.size() << "\n\n";
+
+  // Collect and sort VRegs for consistent output.
+  std::vector<std::pair<Register, VRegMetrics>> VRegMetricsList;
+  VRegMetricsList.reserve(AllMetrics.size());
+  for (const auto &[VReg, Metrics] : AllMetrics)
+    VRegMetricsList.emplace_back(VReg, Metrics);
+
+  // Sort by VReg number for consistent output.
+  llvm::sort(VRegMetricsList, [](const auto &A, const auto &B) {
+    return Register::virtReg2Index(A.first) < Register::virtReg2Index(B.first);
+  });
+
+  // Print header.
+  dbgs() << "VReg RegClass Avail Pure Alias "
+            "TotalLanes MaxWidth Duration\n";
+  dbgs() << "-------- ----------------------- ----- ---- ----- "
+            "---------- -------- --------\n";
+
+  // Print metrics for each VReg.
+  for (const auto &[VReg, Metrics] : VRegMetricsList) {
+    const TargetRegisterClass *RC = MRI.getRegClass(VReg);
+    const char *Status =
+        (Metrics.PureInterferenceDegree >= Metrics.NumAvailableRegs) ? " FAIL"
+                                                                     : "";
+    dbgs() << format("%%vreg%-4u %-23s %5u %4u %5u %10u %8u %8u%s\n",
+                     Register::virtReg2Index(VReg), TRI.getRegClassName(RC),
+                     Metrics.NumAvailableRegs, Metrics.PureInterferenceDegree,
+                     Metrics.AliasingInterferenceDegree, Metrics.TotalLanes,
+                     Metrics.MaxWidth, Metrics.Duration, Status);
+  }
+
+  // Print summary statistics.
+  dbgs() << "\n=== Summary Statistics ===\n";
+
+  // Compute aggregate statistics. The two averages are accumulated as sums
+  // first and divided afterwards.
+  unsigned TotalLanesSum = 0;
+  unsigned MaxWidthMax = 0;
+  unsigned MaxDuration = 0;
+  unsigned MaxPureInterferenceDegree = 0;
+  unsigned MaxAliasingInterferenceDegree = 0;
+  double AvgPureInterferenceDegree = 0.0;
+  double AvgAliasingInterferenceDegree = 0.0;
+
+  for (const auto &[_, Metrics] : VRegMetricsList) {
+    TotalLanesSum += Metrics.TotalLanes;
+    MaxWidthMax = std::max(MaxWidthMax, Metrics.MaxWidth);
+    MaxDuration = std::max(MaxDuration, Metrics.Duration);
+    MaxPureInterferenceDegree =
+        std::max(MaxPureInterferenceDegree, Metrics.PureInterferenceDegree);
+    MaxAliasingInterferenceDegree = std::max(
+        MaxAliasingInterferenceDegree, Metrics.AliasingInterferenceDegree);
+    AvgPureInterferenceDegree += Metrics.PureInterferenceDegree;
+    AvgAliasingInterferenceDegree += Metrics.AliasingInterferenceDegree;
+  }
+
+  // Guard the division: the list may be empty.
+  if (!VRegMetricsList.empty()) {
+    AvgPureInterferenceDegree /= VRegMetricsList.size();
+    AvgAliasingInterferenceDegree /= VRegMetricsList.size();
+  }
+
+  dbgs() << "Total Lanes (sum): " << TotalLanesSum << "\n";
+  dbgs() << "Max Width (max): " << MaxWidthMax << "\n";
+  dbgs() << "Max Duration: " << MaxDuration << "\n";
+  dbgs() << "Max Pure Interference Degree: " << MaxPureInterferenceDegree
+         << "\n";
+  dbgs() << "Max Aliasing Interference Deg: " << MaxAliasingInterferenceDegree
+         << "\n";
+  dbgs() << format("Avg Pure Interference Degree: %.2f\n",
+                   AvgPureInterferenceDegree);
+  dbgs() << format("Avg Aliasing Interference Deg: %.2f\n",
+                   AvgAliasingInterferenceDegree);
+
+  // Count register classes used.
+  DenseMap<const TargetRegisterClass *, unsigned> RCCounts;
+  for (const auto &[VReg, _] : VRegMetricsList)
+    RCCounts[MRI.getRegClass(VReg)]++;
+
+  dbgs() << "\n=== Register Class Distribution ===\n";
+  std::vector<std::pair<const TargetRegisterClass *, unsigned>> RCCountVec;
+  RCCountVec.reserve(RCCounts.size());
+  for (const auto &[RC, Count] : RCCounts)
+    RCCountVec.emplace_back(RC, Count);
+
+  // Sort by count descending. The map is keyed by pointer, so its iteration
+  // order is not stable between runs; break ties on the class ID to keep the
+  // dump reproducible.
+  llvm::sort(RCCountVec, [](const auto &A, const auto &B) {
+    if (A.second != B.second)
+      return A.second > B.second;
+    return A.first->getID() < B.first->getID();
+  });
+
+  for (const auto &[RC, Count] : RCCountVec)
+    dbgs() << format(" %-25s: %u\n", TRI.getRegClassName(RC), Count);
+
+  dbgs() << "\n=== End Virtual Register Metrics ===\n\n";
+}
+
+// Main allocation entry point.
+//
+// Builds the interference graphs and per-vreg metrics once, then runs
+// tryAllocate() with a fixed list of scoring functions until one succeeds.
+// An attempt that reports an infeasible schedule ends the search early.
+//
+// Returns true with OutAssign filled on success. On failure, and when there
+// is nothing to allocate, OutAssign is left empty so that no partial or
+// stale assignment can be picked up by the caller.
+// II is only used for debug output here; MF is not used.
+bool AIEPostRegAlloc::allocate(
+    const DenseMap &LiveLanesByVReg, int II,
+    RegLiveRangeTracker &RegTracker, const MachineFunction &MF,
+    const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI,
+    DenseMap &OutAssign) {
+
+  LLVM_DEBUG(dbgs() << "AIEPostRegAlloc::allocate for "
+                    << LiveLanesByVReg.size() << " vregs, II=" << II << "\n");
+
+  if (LiveLanesByVReg.empty()) {
+    LLVM_DEBUG(dbgs() << "No vregs to allocate\n");
+    // Same postcondition as a successful tryAllocate(): nothing stale.
+    OutAssign.clear();
+    return true;
+  }
+
+  LLVM_DEBUG(dbgs() << "Available " << RegTracker.getAvailablePhysRegs().size()
+                    << " physical registers\n");
+
+  // Initialize allocation state with interference graphs computed once.
+  AllocState State;
+  State.init(&TRI, LiveLanesByVReg, &RegTracker, MRI);
+
+  // Dump virtual register metrics when debug output is enabled.
+  LLVM_DEBUG(dumpVRegMetrics(State.AllMetrics, MRI, TRI));
+
+  // The allocation strategies, in the order in which they are tried.
+  struct AllocationStrategy {
+    const char *Name;
+    ScoringFunction ScoreFn;
+  };
+
+  const AllocationStrategy Strategies[] = {
+      // Try scarce register class priority scoring first.
+      {"scarce register class scoring", scoreByScarceRegClass},
+      // Try interference-based scoring (graph coloring inspired).
+      {"interference degree scoring", scoreByInterference},
+      // Try with area+width scoring (original).
+      {"area+width scoring", scoreByAreaPlusWidth},
+      // Try with pure area scoring.
+      {"area scoring", scoreByArea},
+      // Try with width-priority scoring.
+      {"width scoring", scoreByWidth},
+      // Try with duration scoring.
+      {"duration scoring", scoreByDuration},
+      // Try a custom non-linear scoring function.
+      {"quadratic width scoring",
+       [](const VRegMetrics &M) {
+         if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+           return UINT_MAX;
+         // Quadratic penalty for width, linear for duration.
+         return M.MaxWidth * M.MaxWidth + M.Duration;
+       }},
+  };
+
+  // Try each strategy in order.
+  for (const auto &Strategy : Strategies) {
+    LLVM_DEBUG(dbgs() << "Trying allocation with " << Strategy.Name << "\n");
+
+    AllocResult Result = tryAllocate(LiveLanesByVReg, &RegTracker, TRI, MRI,
+                                     State, Strategy.ScoreFn, OutAssign);
+
+    if (Result) {
+      LLVM_DEBUG(dbgs() << "Allocation succeeded with " << Strategy.Name
+                        << "\n");
+      return true;
+    }
+
+    LLVM_DEBUG(dbgs() << Strategy.Name << " failed\n");
+
+    // If the schedule is infeasible, no other scoring function will succeed.
+    if (Result.isInfeasibleSchedule()) {
+      LLVM_DEBUG(dbgs() << "Schedule is infeasible - skipping remaining "
+                        << "allocation strategies\n");
+      break;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "All allocation attempts failed\n");
+  // Drop the partial assignment left behind by the last failed attempt.
+  OutAssign.clear();
+  return false;
+}
diff --git a/llvm/lib/Target/AIE/AIEPostRegAlloc.h b/llvm/lib/Target/AIE/AIEPostRegAlloc.h
new file mode 100644
index 000000000000..63ccd3c7625a
--- /dev/null
+++ b/llvm/lib/Target/AIE/AIEPostRegAlloc.h
@@ -0,0 +1,320 @@
+//===- AIEPostRegAlloc.h - Post-scheduling register allocator ------------===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file defines a post-scheduling register allocator for AIE targets. +// It performs modulo-aware register allocation for pipelined loops and can +// also be used for non-loop blocks. The allocator is transactional and does +// not spill - it returns false if allocation fails. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIEPOSTREGALLOC_H +#define LLVM_LIB_TARGET_AIE_AIEPOSTREGALLOC_H + +#include "AIELivenessVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegister.h" +#include +#include + +namespace llvm { + +class MachineFunction; +class MachineRegisterInfo; +class TargetRegisterInfo; +class TargetRegisterClass; +class RegLiveRangeTracker; +class RegLiveRange; + +namespace AIE { + +/// Post-scheduling register allocator for AIE targets. +/// +/// This allocator performs modulo-aware register allocation using lane masks +/// to track sub-register liveness. It properly handles physical register +/// aliasing, ensuring that allocating a register blocks all its aliases +/// (sub-registers and super-registers). It is transactional (does not modify +/// MRI until a complete solution is found) and does not spill (returns false +/// if allocation fails). +class AIEPostRegAlloc { +private: + /// Interference graph with configurable weight type and symmetry. + /// @tparam WeightT Type of edge weights (bool for simple, unsigned for + /// weighted). + /// @tparam IsSymmetric Whether the graph is symmetric (undirected) or + /// asymmetric (directed). + template + class InterferenceGraph { + // For symmetric graphs, store upper triangle; for asymmetric, store + // full matrix. Key is (from, to) pair - order matters for asymmetric. 
+ DenseMap, WeightT> Edges; + + public: + /// Add an interference edge with optional weight. + /// For symmetric graphs, order doesn't matter. + /// For asymmetric graphs, this is the weight from A to B. + void addInterference(unsigned A, unsigned B, WeightT Weight = WeightT(1)) { + if constexpr (IsSymmetric) { + if (A > B) + std::swap(A, B); + } + Edges[std::make_pair(A, B)] = Weight; + } + + /// Check if A and B interfere. + bool interferes(unsigned A, unsigned B) const { + if (A == B) + return true; // A node interferes with itself. + if constexpr (IsSymmetric) { + if (A > B) + std::swap(A, B); + } + auto It = Edges.find(std::make_pair(A, B)); + if constexpr (std::is_same_v) { + return It != Edges.end() && It->second; + } else { + return It != Edges.end() && It->second > 0; + } + } + + /// Get the weight of interference from A to B. + /// For asymmetric graphs, this is directional. + WeightT getInterferenceWeight(unsigned A, unsigned B) const { + if (A == B) + return WeightT(0); // No weight for self-interference. + if constexpr (IsSymmetric) { + if (A > B) + std::swap(A, B); + } + auto It = Edges.find(std::make_pair(A, B)); + return (It != Edges.end()) ? It->second : WeightT(0); + } + }; + + // Type aliases for common use cases. + using SimpleSymmetricGraph = InterferenceGraph; + using WeightedSymmetricGraph = InterferenceGraph; + using WeightedAsymmetricGraph = InterferenceGraph; + + /// Pre-computed metrics for a virtual register. + struct VRegMetrics { + // Sum of lanes across all cycles. + unsigned TotalLanes; + // Maximum lanes in any single cycle. + unsigned MaxWidth; + // Number of cycles where register is live. + unsigned Duration; + // Number of other VRegs in the SAME register class that interfere. + unsigned PureInterferenceDegree; + // Weighted interference from VRegs in aliasing register classes. + unsigned AliasingInterferenceDegree; + // Number of available registers in this register class. 
+ unsigned NumAvailableRegs; + }; + + /// Result of an allocation attempt. + /// Default construction indicates success. + /// Construction with bool parameter indicates failure (true = infeasible). + class AllocResult { + bool Success = true; + bool InfeasibleSchedule = false; + + public: + // Default constructor - indicates success. + AllocResult() = default; + + // Constructor for failure cases. + // InfeasibleSchedule=true means no scoring function can succeed. + // InfeasibleSchedule=false means this scoring function failed but another + // might work. + explicit AllocResult(bool InfeasibleSchedule) + : Success(false), InfeasibleSchedule(InfeasibleSchedule) {} + + // Check if the schedule is provably infeasible. + bool isInfeasibleSchedule() const { return InfeasibleSchedule; } + + // Implicit conversion to bool - true if allocation succeeded. + operator bool() const { return Success; } + }; + + /// Internal allocation state with RegUnit-based interference tracking. + struct AllocState { + /// RegUnit occupancy - tracks lane masks for each register unit. + /// RegUnits are the fundamental units of register interference in LLVM. + /// Two registers interfere if they share any RegUnits. + DenseMap RegUnitOccupancy; + + /// Physical register occupancy - tracks lane masks for each allocated + /// physical register (kept for compatibility with existing code). + DenseMap PhysOccupancy; + + /// Pre-computed interference graphs (reused across scoring attempts). + WeightedAsymmetricGraph RCInterferenceGraph; + WeightedSymmetricGraph VRegInterferenceGraph; + + /// Pre-computed metrics for all LiveRanges (reused across scoring + /// attempts). Keyed by VReg since there is a 1:1 mapping. + DenseMap AllMetrics; + + /// Target register info for RegUnit computation. + const TargetRegisterInfo *TRI = nullptr; + + /// Initialize occupancy and compute interference graphs. 
+ /// The RegTracker provides the problem description (LiveRanges, + /// AvailableRegs, AdmissibleRegs per LR). LiveLanesByVReg provides the + /// temporal liveness data computed during scheduling. + void init(const TargetRegisterInfo *TRI, + const DenseMap &LiveLanesByVReg, + const RegLiveRangeTracker *RegTracker, + const MachineRegisterInfo &MRI); + + /// Check if VReg can be placed in PhysReg without conflicts. + /// This checks RegUnit conflicts to handle aliasing properly. + bool canPlace(unsigned VReg, Register PhysReg, + const AIE::LivenessVector &VRegMasks, + const TargetRegisterClass *RC) const; + + /// Place VReg in PhysReg (updates RegUnit occupancy). + void place(unsigned VReg, Register PhysReg, + const AIE::LivenessVector &VRegMasks, + const TargetRegisterClass *RC); + }; + + /// Scoring function type - takes pre-computed metrics and returns a score. + using ScoringFunction = std::function; + +public: + /// Allocate physical registers for virtual registers. + /// + /// \param LiveLanesByVReg Map from virtual register to per-cycle lane masks. + /// \param II Initiation interval for pipelined loops (>= 1). + /// For non-pipelined blocks, use 0 or the schedule length. + /// \param RegTracker RegLiveRangeTracker providing register information. + /// \param MF Machine function being processed. + /// \param TRI Target register info. + /// \param MRI Machine register info (not modified). + /// \param OutAssign Output map from virtual to physical registers. + /// \return True if allocation succeeded, false if no solution found. + static bool + allocate(const DenseMap &LiveLanesByVirtReg, + int II, RegLiveRangeTracker &RegTracker, const MachineFunction &MF, + const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, + DenseMap &OutAssign); + +private: + /// Try to allocate using a specific scoring function for ordering. + /// Returns AllocResult which implicitly converts to bool (true = success). 
+ /// On success, OutAssign contains the virtual to physical register mapping. + /// The RegTracker provides the problem description (LiveRanges, + /// AvailableRegs, AdmissibleRegs per LR). + static AllocResult + tryAllocate(const DenseMap &LiveLanesByVReg, + const RegLiveRangeTracker *RegTracker, + const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, + AllocState &State, ScoringFunction ScoreFn, + DenseMap &OutAssign); + + /// Compute metrics for a live range. + /// \param LR The live range to compute metrics for. + /// \param Masks The lane masks for this live range. + /// \param VRegInterferenceGraph Pre-computed virtual register interference + /// graph. + /// \param AllVRegs All virtual registers to compute degree against. + /// \param RCInterferenceGraph Register class interference graph with + /// weights. + /// \param AvailableRegs Available physical registers. + /// \param MRI Machine register info (for looking up other VRegs' RCs). + /// \param TRI Target register info. + static VRegMetrics + computeMetrics(const RegLiveRange &LR, const AIE::LivenessVector &Masks, + const WeightedSymmetricGraph &VRegInterferenceGraph, + const DenseMap &AllVRegs, + const WeightedAsymmetricGraph &RCInterferenceGraph, + const DenseSet &AvailableRegs, + const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI); + + /// Build register class interference graph with asymmetric weights. + static WeightedAsymmetricGraph + buildRCInterferenceGraph(const DenseSet &UsedRCIds, + const TargetRegisterInfo &TRI); + + /// Build virtual register interference graph (symmetric). + static WeightedSymmetricGraph buildVRegInterferenceGraph( + const DenseMap &LiveLanesByVirtReg, + const MachineRegisterInfo &MRI, + const WeightedAsymmetricGraph &RCInterferenceGraph); + + /// Predefined scoring functions. + /// All return infinite score when pure degree >= available registers. 
+  /// "Infinite" is UINT_MAX. tryAllocate() visits live ranges in descending
+  /// score order, so such ranges are looked at first.
+  //
+  // Area: total number of live lanes over the whole range.
+  static unsigned scoreByArea(const VRegMetrics &M) {
+    if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+      return UINT_MAX;
+    return M.TotalLanes;
+  }
+  // Width: largest number of lanes live at once.
+  static unsigned scoreByWidth(const VRegMetrics &M) {
+    if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+      return UINT_MAX;
+    return M.MaxWidth;
+  }
+  // Duration: number of cycles in which the register is live.
+  static unsigned scoreByDuration(const VRegMetrics &M) {
+    if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+      return UINT_MAX;
+    return M.Duration;
+  }
+  // Area first, width as the minor term.
+  static unsigned scoreByAreaPlusWidth(const VRegMetrics &M) {
+    if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+      return UINT_MAX;
+    return M.TotalLanes * 10 + M.MaxWidth;
+  }
+  // Score by interference degree - considers both pure and aliasing.
+  // NOTE(review): AliasingInterferenceDegree is a sum of register class
+  // weights that buildRCInterferenceGraph() scales by 100, so one aliasing
+  // neighbour can contribute more here than one pure neighbour - confirm
+  // that this matches the "pure is critical, aliasing is secondary" intent.
+  static unsigned scoreByInterference(const VRegMetrics &M) {
+    if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+      return UINT_MAX;
+    // Pure interference is critical, aliasing interference is secondary.
+    return M.PureInterferenceDegree * 1000 + M.AliasingInterferenceDegree * 10 +
+           M.TotalLanes;
+  }
+  // Score prioritizing scarce register classes (fewer available registers).
+  // Register classes with fewer available registers get HIGHER scores,
+  // so they are allocated FIRST, giving them first pick of registers.
+  static unsigned scoreByScarceRegClass(const VRegMetrics &M) {
+    if (M.PureInterferenceDegree >= M.NumAvailableRegs)
+      return UINT_MAX;
+    // Fewer available registers = higher scarceness bonus, with a large
+    // multiplier to make this the dominant factor.
+    // The bonus bottoms out at zero once 100 or more registers are
+    // available. Computing 100 - NumAvailableRegs directly would wrap around
+    // in unsigned arithmetic and hand the least scarce ranges a huge bonus.
+    const unsigned ScarcenessCap = 100;
+    const unsigned Scarceness = M.NumAvailableRegs < ScarcenessCap
+                                    ? ScarcenessCap - M.NumAvailableRegs
+                                    : 0;
+    const unsigned ScarcenessBonus = Scarceness * 10000;
+    // Add interference as secondary factor.
+    const unsigned InterferenceScore = M.PureInterferenceDegree * 1000 +
+                                       M.AliasingInterferenceDegree * 10 +
+                                       M.TotalLanes;
+    return ScarcenessBonus + InterferenceScore;
+  }
+
+  /// Get allocatable physical registers for a live range.
+ /// Returns the intersection of AdmissibleRegs (semantic constraint from + /// instruction encoding) and AvailableRegs (global availability). + static std::vector + getCandidatePhysRegs(const DenseSet &AdmissibleRegs, + const DenseSet &AvailableRegs); + + /// Dump virtual register metrics for debugging. + static void dumpVRegMetrics(const DenseMap &AllMetrics, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI); +}; + +} // namespace AIE +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AIE_AIEPOSTREGALLOC_H diff --git a/llvm/lib/Target/AIE/AIERegDefUseTracker.cpp b/llvm/lib/Target/AIE/AIERegDefUseTracker.cpp new file mode 100644 index 000000000000..212e577c561c --- /dev/null +++ b/llvm/lib/Target/AIE/AIERegDefUseTracker.cpp @@ -0,0 +1,1780 @@ +//===- AIERegDefUseTracker.cpp - Track Register Live Ranges --------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file implements tracking and analysis of register live ranges in a +// MachineBasicBlock. 
The tracker performs the following: +// - Identifies register definitions and uses that form live ranges +// - Merges aliasing register accesses into unified live ranges +// - Filters out unsafe ranges (tied operands, live-in/out, implicit uses) +// - Computes appropriate register classes for each live range +// - Optionally replaces physical registers with virtual registers for testing +// +//===----------------------------------------------------------------------===// + +#include "AIERegDefUseTracker.h" +#include "AIEBaseInstrInfo.h" +#include "AIEBaseRegisterInfo.h" +#include "Utils/AIEMachineInstrPrint.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aie-reg-liverange" + +using namespace llvm; + +namespace { + +/// Check if a register overlaps with a RegisterMaskPair (live-in/out entry). +/// Currently uses conservative full-register overlap; lane mask support can +/// be added later. +bool overlapsRMP(MCRegister Reg, const MachineBasicBlock::RegisterMaskPair &RMP, + const TargetRegisterInfo *TRI) { + return TRI->regsOverlap(Reg, RMP.PhysReg); +} + +} // end anonymous namespace + +void RegLiveRange::dumpBrief(const TargetRegisterInfo *TRI) const { + StringRef Name = + (BaseReg != MCRegister::NoRegister) ? 
TRI->getName(BaseReg) : "unknown"; + + dbgs() << " - LR#" << ID << " Base=" << Name << " defs=" << getNumDefs() + << " uses=" << getNumUses(); + + if (IsReserved) { + dbgs() << " [RESERVED]"; + } + + // Print first def if available + if (!Defs.empty()) { + const MachineInstr *MI = Defs[0].getOperand()->getParent(); + assert(MI && "Def operand must have a parent instruction"); + dbgs() << " firstDef: " << AIE::NoDebug(*MI); + } + + dbgs() << "\n"; +} + +static cl::opt ExcludeLiveRangesByRegClass( + "aie-exclude-liveranges-by-regclass", cl::Hidden, cl::init(""), + cl::desc("[AIE] Exclude live ranges of the specified register class name. " + "Empty string means no filtering.")); + +static cl::opt AddUnusedCallerSavedRegs( + "aie-add-unused-caller-saved-regs", cl::Hidden, cl::init(false), + cl::desc("[AIE] Add unused caller-saved registers to the available " + "register pool for pipelining. Only safe when loops with calls " + "are excluded from pipelining.")); + +RegLiveRangeTracker::RegLiveRangeTracker(MachineBasicBlock &MBB) + : MF(MBB.getParent()), TRI(MF->getSubtarget().getRegisterInfo()), + TII(static_cast( + MF->getSubtarget().getInstrInfo())) { + assert(MF && "MachineFunction cannot be null"); + assert(TRI && "TargetRegisterInfo cannot be null"); + assert(TII && "TargetInstrInfo cannot be null"); +} + +void RegLiveRange::addDef(MachineOperand *DefOp, unsigned SubRegIdx) { + Defs.emplace_back(DefOp, SubRegIdx); +} + +void RegLiveRange::addUse(MachineOperand *UseOp, unsigned SubRegIdx) { + Uses.emplace_back(UseOp, SubRegIdx); +} + +void RegLiveRange::mergeFrom(const RegLiveRange &Other, + const TargetRegisterInfo *TRI) { + // Helper to compute sub-register index. 
+ auto GetSubRegIdx = [TRI](MCRegister AccessReg, + MCRegister NewBaseReg) -> unsigned { + if (AccessReg == NewBaseReg) + return 0; + for (MCSubRegIndexIterator SubRegIdxIt(NewBaseReg, TRI); + SubRegIdxIt.isValid(); ++SubRegIdxIt) { + if (SubRegIdxIt.getSubReg() == AccessReg) { + return SubRegIdxIt.getSubRegIndex(); + } + } + return 0; + }; + + // Helper to check if Reg1 is a sub-register of Reg2 (Reg2 is larger). + auto IsSubReg = [TRI](MCRegister Reg1, MCRegister Reg2) -> bool { + for (MCSubRegIndexIterator SubRegIdxIt(Reg2, TRI); SubRegIdxIt.isValid(); + ++SubRegIdxIt) { + if (SubRegIdxIt.getSubReg() == Reg1) { + return true; + } + } + return false; + }; + + // Helper to check if a candidate register contains all operand registers. + // A register R "contains" an operand register OR if OR == R or OR is a + // sub-register of R. + auto ContainsAllOperands = + [&IsSubReg](MCRegister Candidate, + ArrayRef OperandRegs) -> bool { + for (MCRegister OpReg : OperandRegs) { + if (OpReg != Candidate && !IsSubReg(OpReg, Candidate)) { + return false; + } + } + return true; + }; + + // Collect all operand registers from both ranges. + SmallVector AllOperandRegs; + for (const auto &DefInfo : Defs) { + AllOperandRegs.push_back(DefInfo.getOperand()->getReg().asMCReg()); + } + for (const auto &UseInfo : Uses) { + AllOperandRegs.push_back(UseInfo.getOperand()->getReg().asMCReg()); + } + for (const auto &DefInfo : Other.Defs) { + AllOperandRegs.push_back(DefInfo.getOperand()->getReg().asMCReg()); + } + for (const auto &UseInfo : Other.Uses) { + AllOperandRegs.push_back(UseInfo.getOperand()->getReg().asMCReg()); + } + + // Compute the new base register: the smallest register that contains all + // operand registers. Start with the current base registers as candidates. 
+ MCRegister NewBaseReg = BaseReg; + if (NewBaseReg == MCRegister::NoRegister) { + NewBaseReg = Other.BaseReg; + } else if (Other.BaseReg != MCRegister::NoRegister) { + // Check if we need to update to a larger base register. + if (IsSubReg(NewBaseReg, Other.BaseReg)) { + NewBaseReg = Other.BaseReg; + } + } + + // If the current NewBaseReg doesn't contain all operands (e.g., sibling + // registers like cml4 and cmh4), find the smallest common super-register. + if (NewBaseReg != MCRegister::NoRegister && + !ContainsAllOperands(NewBaseReg, AllOperandRegs)) { + // Search for the smallest super-register that contains all operands. + // We iterate through super-registers of NewBaseReg in ascending order + // (MCSuperRegIterator yields them from smallest to largest). + for (MCSuperRegIterator SuperIt(NewBaseReg, TRI); SuperIt.isValid(); + ++SuperIt) { + if (ContainsAllOperands(*SuperIt, AllOperandRegs)) { + NewBaseReg = *SuperIt; + break; + } + } + } + + // Re-add existing operands with updated sub-register indices if base + // changed. + if (NewBaseReg != BaseReg) { + SmallVector OldDefs = std::move(Defs); + SmallVector OldUses = std::move(Uses); + Defs.clear(); + Uses.clear(); + + for (const auto &DefInfo : OldDefs) { + const MCRegister DefReg = DefInfo.getOperand()->getReg().asMCReg(); + Defs.emplace_back(DefInfo.getOperand(), GetSubRegIdx(DefReg, NewBaseReg)); + } + for (const auto &UseInfo : OldUses) { + const MCRegister UseReg = UseInfo.getOperand()->getReg().asMCReg(); + Uses.emplace_back(UseInfo.getOperand(), GetSubRegIdx(UseReg, NewBaseReg)); + } + + BaseReg = NewBaseReg; + } + + // Merge defs from Other with computed sub-register indices. + for (const auto &DefInfo : Other.defs()) { + const MCRegister DefReg = DefInfo.getOperand()->getReg().asMCReg(); + Defs.emplace_back(DefInfo.getOperand(), GetSubRegIdx(DefReg, NewBaseReg)); + } + + // Merge uses from Other with computed sub-register indices. 
+ for (const auto &UseInfo : Other.uses()) { + const MCRegister UseReg = UseInfo.getOperand()->getReg().asMCReg(); + Uses.emplace_back(UseInfo.getOperand(), GetSubRegIdx(UseReg, NewBaseReg)); + } + + // Propagate reserved status: if Other is reserved, this becomes reserved. + if (Other.IsReserved) { + IsReserved = true; + } +} + +void RegLiveRange::expandBaseToInclude(MCRegister ExtReg, + const TargetRegisterInfo *TRI) { + if (ExtReg == MCRegister::NoRegister) + return; + + // Helper to compute sub-register index. + auto GetSubRegIdx = [TRI](MCRegister AccessReg, + MCRegister NewBaseReg) -> unsigned { + if (AccessReg == NewBaseReg) + return 0; + for (MCSubRegIndexIterator SubRegIdxIt(NewBaseReg, TRI); + SubRegIdxIt.isValid(); ++SubRegIdxIt) { + if (SubRegIdxIt.getSubReg() == AccessReg) { + return SubRegIdxIt.getSubRegIndex(); + } + } + return 0; + }; + + // Helper to check if Reg1 is a sub-register of Reg2 (Reg2 is larger). + auto IsSubReg = [TRI](MCRegister Reg1, MCRegister Reg2) -> bool { + for (MCSubRegIndexIterator SubRegIdxIt(Reg2, TRI); SubRegIdxIt.isValid(); + ++SubRegIdxIt) { + if (SubRegIdxIt.getSubReg() == Reg1) { + return true; + } + } + return false; + }; + + // If BaseReg is not set, just use ExtReg. + if (BaseReg == MCRegister::NoRegister) { + BaseReg = ExtReg; + return; + } + + // If ExtReg is already contained by BaseReg, nothing to do. + if (ExtReg == BaseReg || IsSubReg(ExtReg, BaseReg)) + return; + + // If BaseReg is contained by ExtReg, upgrade to ExtReg. + if (IsSubReg(BaseReg, ExtReg)) { + // Recompute SubRegIdx for existing operands. 
+ SmallVector OldDefs = std::move(Defs); + SmallVector OldUses = std::move(Uses); + Defs.clear(); + Uses.clear(); + + for (const auto &DefInfo : OldDefs) { + const MCRegister DefReg = DefInfo.getOperand()->getReg().asMCReg(); + Defs.emplace_back(DefInfo.getOperand(), GetSubRegIdx(DefReg, ExtReg)); + } + for (const auto &UseInfo : OldUses) { + const MCRegister UseReg = UseInfo.getOperand()->getReg().asMCReg(); + Uses.emplace_back(UseInfo.getOperand(), GetSubRegIdx(UseReg, ExtReg)); + } + + BaseReg = ExtReg; + return; + } + + // Neither is a subreg of the other - find the smallest common super-register. + // Collect all operand registers plus ExtReg. + SmallVector AllRegs; + AllRegs.push_back(ExtReg); + for (const auto &DefInfo : Defs) { + AllRegs.push_back(DefInfo.getOperand()->getReg().asMCReg()); + } + for (const auto &UseInfo : Uses) { + AllRegs.push_back(UseInfo.getOperand()->getReg().asMCReg()); + } + + // Helper to check if a candidate register contains all registers. + auto ContainsAll = [&IsSubReg](MCRegister Candidate, + ArrayRef Regs) -> bool { + for (MCRegister R : Regs) { + if (R != Candidate && !IsSubReg(R, Candidate)) { + return false; + } + } + return true; + }; + + // Search for the smallest super-register that contains all. + MCRegister NewBaseReg = BaseReg; + for (MCSuperRegIterator SuperIt(BaseReg, TRI); SuperIt.isValid(); ++SuperIt) { + if (ContainsAll(*SuperIt, AllRegs)) { + NewBaseReg = *SuperIt; + break; + } + } + + // Recompute SubRegIdx for existing operands. 
+ if (NewBaseReg != BaseReg) { + SmallVector OldDefs = std::move(Defs); + SmallVector OldUses = std::move(Uses); + Defs.clear(); + Uses.clear(); + + for (const auto &DefInfo : OldDefs) { + const MCRegister DefReg = DefInfo.getOperand()->getReg().asMCReg(); + Defs.emplace_back(DefInfo.getOperand(), GetSubRegIdx(DefReg, NewBaseReg)); + } + for (const auto &UseInfo : OldUses) { + const MCRegister UseReg = UseInfo.getOperand()->getReg().asMCReg(); + Uses.emplace_back(UseInfo.getOperand(), GetSubRegIdx(UseReg, NewBaseReg)); + } + + BaseReg = NewBaseReg; + } +} + +void RegLiveRange::clear() { + Defs.clear(); + Uses.clear(); + BaseReg = MCRegister::NoRegister; + RegisterClass = nullptr; + AdmissibleRegs.clear(); + VReg = Register(); + IsScarce = false; + IsReserved = false; + ID = -1; +} + +/// Get the sub-register index if AccessReg is a sub-register of BaseReg. +/// Returns 0 if AccessReg is not a sub-register of BaseReg. +unsigned RegLiveRangeTracker::getSubRegIndex(MCRegister AccessReg, + MCRegister BaseReg) const { + if (AccessReg == BaseReg) + return 0; + + // Check if AccessReg is a sub-register of BaseReg + for (MCSubRegIndexIterator SubRegIdxIt(BaseReg, TRI); SubRegIdxIt.isValid(); + ++SubRegIdxIt) { + if (SubRegIdxIt.getSubReg() == AccessReg) { + return SubRegIdxIt.getSubRegIndex(); + } + } + + return 0; +} + +bool RegLiveRangeTracker::overlapsAnyInSet( + MCRegister Reg, const DenseSet &RegSet) const { + for (MCRegister R : RegSet) { + if (TRI->regsOverlap(Reg, R)) + return true; + } + return false; +} + +bool RegLiveRangeTracker::isFullyDefined( + const RegLiveRange &LR, + const DenseMap &LocalLiveLaneMasks, + const MachineBasicBlock &MBB) const { + // A live range is fully defined if its algorithm-local live lanemasks + // do not intersect with the live-in set of the block. + // + // This is more precise than just checking register overlap: it allows + // ranges where the live lanes are disjoint from the live-in lanes. 
+ // + // Importantly, this can discriminate between a truly undefined register + // (which is not in the live-in set and is safe to virtualize) and a + // register that was defined outside of the loop (which is in the live-in + // set and should be rejected because changing it would affect loop-carried + // values). + + // Check each register in LocalLiveLaneMasks that overlaps with the base + // register. + for (const auto &[LiveReg, LocalLanes] : LocalLiveLaneMasks) { + if (!TRI->regsOverlap(LR.getBaseReg(), LiveReg)) + continue; + + // Found an overlapping register with non-zero live lanes. + // Check if these lanes intersect with the live-in set. + for (const auto &LiveIn : MBB.liveins()) { + if (!TRI->regsOverlap(LiveReg, LiveIn.PhysReg)) + continue; + + // Check if the algorithm-local live lanes intersect with the live-in + // lanes. + if ((LocalLanes & LiveIn.LaneMask).any()) { + return false; + } + } + } + + return true; +} + +bool RegLiveRangeTracker::hasTiedOperands(const RegLiveRange &LR) const { + assert(TII); + + // Check if any operand in this live range is tied + for (const auto &Def : LR.defs()) { + MachineOperand *MO = Def.getOperand(); + if (MO->isTied()) + return true; + + MachineInstr *MI = MO->getParent(); + assert(MI); + + // Get the operand index for this def + unsigned OpIdx = MO->getOperandNo(); + + // Check AIE-specific tied register info + const auto TiedInfo = TII->getTiedRegInfo(*MI); + for (const auto &TiedSet : TiedInfo) { + // Check if this operand is in the destination operands of a tied set + for (const auto &DstOp : TiedSet.DstOps) { + if (DstOp.OpIdx == OpIdx) + return true; + } + // Check if this operand is in the source operands of a tied set + for (const auto &SrcOp : TiedSet.SrcOps) { + if (SrcOp.OpIdx == OpIdx) + return true; + } + } + + const MCRegister R = MO->getReg().asMCReg(); + const int DefIdx = MI->findRegisterDefOperandIdx(R, TRI); + if (DefIdx >= 0 && MI->isRegTiedToUseOperand(DefIdx)) + return true; + } + + // 
Also check uses for tied operands + for (const auto &Use : LR.uses()) { + MachineOperand *MO = Use.getOperand(); + + // Get the operand index for this use + unsigned OpIdx = MO->getOperandNo(); + + MachineInstr *MI = Use.getOperand()->getParent(); + assert(MI); + + // Check AIE-specific tied register info + const auto TiedInfo = TII->getTiedRegInfo(*MI); + for (const auto &TiedSet : TiedInfo) { + // Check if this operand is in the source operands of a tied set + for (const auto &SrcOp : TiedSet.SrcOps) { + if (SrcOp.OpIdx == OpIdx) + return true; + } + } + } + + return false; +} + +void RegLiveRangeTracker::pruneByFullCoverage() { + LLVM_DEBUG(dbgs() << "\nPrune by full coverage: " << LiveRanges.size() + << " ranges before pruning\n"); + + // We run this in a fixed point loop, since pruning a range may uncover ranges + // that were previously covered by it. + bool Changed = true; + while (Changed) { + Changed = false; + + // Build coverage map from current LiveRanges + DenseSet CoveredOps; + for (const RegLiveRange &LR : LiveRanges) { + for (const auto &R : LR.operands()) { + CoveredOps.insert(R.getOperand()); + } + } + + // Check if there are any uncovered operands that alias with this LR's + // registers + auto HasUncoveredAlias = [&](const DenseSet &LRRegs, + MCRegister *SampleUncovered = nullptr) { + for (MachineOperand *MO : AllPhysRegOperands) { + if (!CoveredOps.contains(MO)) { + MCRegister UncoveredReg = MO->getReg().asMCReg(); + // Check if this uncovered operand aliases with any register in this + // LR + for (const MCRegister LRReg : LRRegs) { + if (TRI->regsOverlap(UncoveredReg, LRReg)) { + if (SampleUncovered) + *SampleUncovered = UncoveredReg; + return true; + } + } + } + } + return false; + }; + + // For each live range, check if ALL operands of its register group are + // covered + SmallVector NewLiveRanges; + for (const RegLiveRange &LR : LiveRanges) { + // Collect all registers used in this live range + DenseSet LRRegs; + for (const auto &R : 
LR.operands()) { + LRRegs.insert(R.getOperand()->getReg().asMCReg()); + } + + MCRegister SampleUncovered = MCRegister::NoRegister; + if (!HasUncoveredAlias(LRRegs, &SampleUncovered)) { + NewLiveRanges.push_back(LR); + } else { + LLVM_DEBUG({ + dbgs() << "Reject: pruned by full coverage"; + if (SampleUncovered != MCRegister::NoRegister) + dbgs() << " (uncovered alias " << TRI->getName(SampleUncovered) + << ")"; + dbgs() << ": "; + LR.dumpBrief(TRI); + }); + Changed = true; + } + } + + LiveRanges = std::move(NewLiveRanges); + } + + LLVM_DEBUG(dbgs() << "After pruning: " << LiveRanges.size() << " ranges\n"); + +#ifndef NDEBUG + // Verify that all remaining operands are covered + DenseSet FinalCoveredOps; + for (const RegLiveRange &LR : LiveRanges) { + for (const auto &R : LR.operands()) { + FinalCoveredOps.insert(R.getOperand()); + } + } + + for (MachineOperand *MO : AllPhysRegOperands) { + if (!FinalCoveredOps.contains(MO)) { + const MCRegister U = MO->getReg().asMCReg(); + // Verify no LR overlaps with this uncovered operand + for (const RegLiveRange &LR : LiveRanges) { + for (const auto &R : LR.operands()) { + assert(!TRI->regsOverlap(U, R.getOperand()->getReg().asMCReg()) && + "Uncovered operand overlaps with kept live range!"); + } + } + } + } +#endif +} + +void RegLiveRangeTracker::mergeAliasingLiveRanges( + unsigned DefLRIdx, MCRegister DefReg, + DenseMap> &LiveRegs, + DenseMap &OperandToLiveRange) { + + // Helper to check if a def register's lanes overlap with a live register's + // current lanes. This is critical for separating live ranges: after x10 is + // defined, any y5 (containing x10) should only have x11's lanes live, and a + // subsequent x10 def should NOT merge into that y5 range. + auto LanesOverlap = [this](MCRegister DefR, MCRegister LiveR, + LaneBitmask LiveLanes) -> bool { + // If registers are equal, check if any lanes are live. + if (DefR == LiveR) + return LiveLanes.any(); + + // Check if DefR is a subreg of LiveR. 
+ for (MCSubRegIndexIterator SubIdxIt(LiveR, TRI); SubIdxIt.isValid(); + ++SubIdxIt) { + if (SubIdxIt.getSubReg() == DefR) { + // DefR is a subreg of LiveR - check if DefR's lanes are live. + const LaneBitmask DefLanes = + TRI->getSubRegIndexLaneMask(SubIdxIt.getSubRegIndex()); + return (LiveLanes & DefLanes).any(); + } + } + + // Check if LiveR is a subreg of DefR. + for (MCSubRegIndexIterator SubIdxIt(DefR, TRI); SubIdxIt.isValid(); + ++SubIdxIt) { + if (SubIdxIt.getSubReg() == LiveR) { + // LiveR is a subreg of DefR - if any lanes of LiveR are live, + // they overlap with DefR. + return LiveLanes.any(); + } + } + + // Registers overlap but no subreg relationship - conservatively treat + // as overlapping if any lanes are live. + return LiveLanes.any(); + }; + + // Collect all aliasing live registers and their live ranges. + // Only include registers where the lanes actually overlap. + SmallVector, 8> AliasingLiveRegs; + for (const auto &[LiveReg, Info] : LiveRegs) { + if (TRI->regsOverlap(DefReg, LiveReg) && + LanesOverlap(DefReg, LiveReg, Info.second)) { + AliasingLiveRegs.push_back({LiveReg, Info.first}); + } + } + + if (AliasingLiveRegs.empty()) + return; + + // Collect all unique live range indices to merge (excluding NoLiveRange + // sentinels which represent live-out registers without actual ranges). + SmallVector ToMerge; + for (const auto &[LiveReg, LRIdx] : AliasingLiveRegs) { + if (LRIdx != RegLiveRange::NoLiveRange) { + // Check if we already have this index. + if (llvm::find(ToMerge, static_cast(LRIdx)) == ToMerge.end() && + static_cast(LRIdx) != DefLRIdx) { + ToMerge.push_back(static_cast(LRIdx)); + } + } + } + + // Compute reserved status before merging. + // Check if any aliasing live register is a live-out sentinel. 
+ bool IsReservedFromLiveOut = false; + for (const auto &[LiveReg, LRIdx] : AliasingLiveRegs) { + if (LRIdx == RegLiveRange::NoLiveRange) { + IsReservedFromLiveOut = true; + break; + } + } + + // Also check if any subreg of DefReg is live-out. + if (!IsReservedFromLiveOut) { + for (MCSubRegIterator SubIt(DefReg, TRI, /*IncludeSelf=*/true); + SubIt.isValid(); ++SubIt) { + auto It = LiveRegs.find(*SubIt); + if (It != LiveRegs.end() && + It->second.first == RegLiveRange::NoLiveRange) { + IsReservedFromLiveOut = true; + break; + } + } + } + + // Get the target live range and update its reserved status. + RegLiveRange &TargetLR = LiveRanges[DefLRIdx]; + if (IsReservedFromLiveOut) { + TargetLR.setIsReserved(true); + } + + // Expand TargetLR's base to include any external registers from + // AliasingLiveRegs that don't have actual live ranges (live-out sentinels). + // These registers affect the base register size but have no operands. + for (const auto &[LiveReg, LRIdx] : AliasingLiveRegs) { + if (LRIdx == RegLiveRange::NoLiveRange) { + TargetLR.expandBaseToInclude(LiveReg, TRI); + } + } + + // Incrementally merge all other live ranges into the target. + // The enhanced mergeFrom() automatically computes the smallest common + // super-register that contains all operands from both ranges. + for (unsigned LRIdx : ToMerge) { + TargetLR.mergeFrom(LiveRanges[LRIdx], TRI); + + // Clear the source range (mark as invalid). + LiveRanges[LRIdx].clear(); + + // Update all LiveRegs entries that pointed to the merged range. + for (auto &[LiveReg, Info] : LiveRegs) { + if (Info.first == static_cast(LRIdx)) { + Info.first = static_cast(DefLRIdx); + } + } + + // Update OperandToLiveRange. + for (auto &Entry : OperandToLiveRange) { + if (Entry.second == LRIdx) { + Entry.second = DefLRIdx; + } + } + } + + // Remove fully redefined registers from LiveRegs. 
+ for (const auto &[LiveReg, LRIdx] : AliasingLiveRegs) { + if (DefReg == LiveReg || getSubRegIndex(LiveReg, DefReg) != 0) { + LiveRegs.erase(LiveReg); + } + } + + // Update lane masks for partially redefined super-registers. + // When DefReg is a subreg of LiveReg, the def kills DefReg's lanes within + // LiveReg. This is critical for separating live ranges: after x10 is defined, + // any y5 (containing x10) should only have x11's lanes live, not x10's. + for (const auto &[LiveReg, OrigLRIdx] : AliasingLiveRegs) { + // Skip if already erased (fully redefined). + auto LiveIt = LiveRegs.find(LiveReg); + if (LiveIt == LiveRegs.end()) + continue; + + // Check if DefReg is a subreg of LiveReg (DefReg partially kills LiveReg). + const unsigned SubRegIdx = getSubRegIndex(DefReg, LiveReg); + if (SubRegIdx != 0) { + // DefReg is a subreg of LiveReg - update LiveReg's lane mask. + const LaneBitmask DefLanes = TRI->getSubRegIndexLaneMask(SubRegIdx); + LiveIt->second.second &= ~DefLanes; + + // If no lanes remain live, remove the entry entirely. + if (LiveIt->second.second.none()) { + LiveRegs.erase(LiveIt); + } + } + } + + // Check if this def, combined with other defs in the merged range, + // fully defines a super-register. If so, remove the super-register from + // LiveRegs. + const MCRegister MergedBaseReg = TargetLR.getBaseReg(); + + // Collect all defined sub-registers. + DenseSet AllDefinedRegs; + for (const auto &DefInfo : TargetLR.defs()) { + const MCRegister DefRegister = DefInfo.getOperand()->getReg().asMCReg(); + AllDefinedRegs.insert(DefRegister); + // Also add all sub-registers of this defined register. + for (MCSubRegIterator SubIt(DefRegister, TRI, /*IncludeSelf=*/false); + SubIt.isValid(); ++SubIt) { + AllDefinedRegs.insert(*SubIt); + } + } + + // Check if all sub-registers of a register are defined. 
+ auto FullyCovered = [&](MCRegister Reg) { + for (MCSubRegIterator SubIt(Reg, TRI, /*IncludeSelf=*/false); + SubIt.isValid(); ++SubIt) { + if (!AllDefinedRegs.count(*SubIt)) { + return false; + } + } + return true; + }; + + // Check BaseReg and its super-registers. + SmallVector RegsToCheck; + RegsToCheck.push_back(MergedBaseReg); + for (MCSuperRegIterator SuperIt(MergedBaseReg, TRI); SuperIt.isValid(); + ++SuperIt) { + RegsToCheck.push_back(*SuperIt); + } + + for (const MCRegister CheckReg : RegsToCheck) { + if (FullyCovered(CheckReg)) { + LiveRegs.erase(CheckReg); + for (MCSuperRegIterator SuperIt(CheckReg, TRI); SuperIt.isValid(); + ++SuperIt) { + LiveRegs.erase(*SuperIt); + } + } + } +} + +DenseSet RegLiveRangeTracker::collectReservedBaseRegs() const { + DenseSet ReservedRegs; + for (const RegLiveRange &LR : LiveRanges) { + if (LR.isReserved()) { + ReservedRegs.insert(LR.getBaseReg()); + } + } + return ReservedRegs; +} + +void RegLiveRangeTracker::computeAvailableFromLiveRanges( + const DenseSet &ReservedRegs) { + + // Lambda to check if a register overlaps with any reserved register. + auto OverlapsReserved = [&](MCRegister Reg) { + return llvm::any_of(ReservedRegs, [&](MCRegister Reserved) { + return TRI->regsOverlap(Reg, Reserved); + }); + }; + + // Build AvailablePhysRegs from non-reserved ranges, excluding any + // register that overlaps with a reserved register. + AvailablePhysRegs.clear(); + for (const RegLiveRange &LR : LiveRanges) { + assert(LR.getRegisterClass() && + "Live range must have a valid register class"); + assert(LR.getBaseReg() != MCRegister::NoRegister && + "Live range must have a base register"); + assert(LR.getBaseReg().isPhysical() && + "BaseReg must be a physical register"); + + // Skip if this range is reserved. + if (LR.isReserved()) { + continue; + } + + // Skip if base register overlaps with any reserved register. 
+ // Sub-registers are contained within the base, so if the base doesn't + // overlap with reserved, neither will any sub-register. + if (OverlapsReserved(LR.getBaseReg())) { + continue; + } + + // Add base register and all its sub-registers. + AvailablePhysRegs.insert(LR.getBaseReg()); + for (MCSubRegIterator SubIt(LR.getBaseReg(), TRI, /*IncludeSelf=*/false); + SubIt.isValid(); ++SubIt) { + AvailablePhysRegs.insert(*SubIt); + } + } +} + +void RegLiveRangeTracker::deriveSuperRegsFromSubRegs() { + // If all sub-registers of a super-register are available, add the + // super-register as well. This avoids repeated computation in PostRegAlloc. + SmallVector RegsToCheck(AvailablePhysRegs.begin(), + AvailablePhysRegs.end()); + for (MCRegister AvailReg : RegsToCheck) { + for (MCSuperRegIterator SuperIt(AvailReg, TRI, /*IncludeSelf=*/false); + SuperIt.isValid(); ++SuperIt) { + const MCRegister SuperReg = *SuperIt; + + // Skip if already available. + if (AvailablePhysRegs.count(SuperReg)) + continue; + + // Check if all sub-registers of SuperReg are available. + bool AllSubregsAvailable = true; + unsigned SubregCount = 0; + for (MCSubRegIterator SubIt(SuperReg, TRI, /*IncludeSelf=*/false); + SubIt.isValid(); ++SubIt) { + ++SubregCount; + if (!AvailablePhysRegs.count(*SubIt)) { + AllSubregsAvailable = false; + break; + } + } + + // If we have at least 2 sub-registers and all are available, + // add this super-register. + if (AllSubregsAvailable && SubregCount >= 2) { + AvailablePhysRegs.insert(SuperReg); + } + } + } +} + +void RegLiveRangeTracker::addUnusedCallerSavedRegs( + MachineBasicBlock &MBB, const DenseSet &ImplicitRegs, + const DenseSet &ReservedRegs) { + + // This feature is controlled by a command-line option because it changes + // the available register pool, which can affect register allocation results. + if (!AddUnusedCallerSavedRegs) + return; + + // Augment AvailablePhysRegs with caller-saved registers that are completely + // unused in this block. 
Since pipelining excludes loops with calls, these + // registers are safe to use as additional allocation candidates. + // + // A caller-saved register is safe to add if: + // 1. It is allocatable (not reserved by the target) + // 2. It belongs to a register class used by at least one live range + // 3. It does not overlap with any register used in the block (explicit ops) + // 4. It does not overlap with any register used implicitly + // 5. It does not overlap with any live-in register (respecting lane masks) + // 6. It does not overlap with any live-out register (respecting lane masks) + // 7. It does not overlap with any reserved live range + + // Collect the set of register classes used by live ranges. + SmallPtrSet UsedRegClasses; + for (const RegLiveRange &LR : LiveRanges) { + if (LR.getRegisterClass()) { + UsedRegClasses.insert(LR.getRegisterClass()); + } + } + + // If no live ranges have register classes, nothing to add. + if (UsedRegClasses.empty()) + return; + + const auto *AIERII = static_cast(TRI); + + // Get the call-preserved mask. clobbersPhysReg returns true for caller-saved + // registers (those NOT preserved across calls). + const uint32_t *PreservedMask = + AIERII->getCallPreservedMask(*MF, CallingConv::C); + const BitVector AllocatableRegs = TRI->getAllocatableSet(*MF); + + // Generic lambda to check if a register overlaps with any register in a + // range. Works with any range that yields MCRegister. + auto OverlapsAny = [this](MCRegister Reg, auto &&Range) { + return llvm::any_of(Range, + [&](MCRegister R) { return TRI->regsOverlap(Reg, R); }); + }; + + // Generic lambda to check if a register overlaps with any RegisterMaskPair + // in a range. Works with MBB.liveins() and MBB.liveouts(). 
+ auto OverlapsAnyRMP = [this](MCRegister Reg, auto &&Range) { + return llvm::any_of(Range, + [&](const MachineBasicBlock::RegisterMaskPair &RMP) { + return overlapsRMP(Reg, RMP, TRI); + }); + }; + + // Helper to check if Reg is caller-saved (clobbered by calls). + auto IsCallerSaved = [PreservedMask](MCRegister Reg) { + return MachineOperand::clobbersPhysReg(PreservedMask, Reg); + }; + + // Transformer for AllPhysRegOperands to yield MCRegister. + auto ToReg = [](const MachineOperand *MO) { return MO->getReg().asMCReg(); }; + + // Iterate over allocatable registers and add unused caller-saved ones. + unsigned NumUnusedCallerSavedAdded = 0; + for (unsigned RegIdx = 0, E = TRI->getNumRegs(); RegIdx < E; ++RegIdx) { + const MCRegister Reg = MCRegister::from(RegIdx); + + // Skip if already available. + if (AvailablePhysRegs.count(Reg)) + continue; + + // Must be allocatable. + if (!AllocatableRegs.test(RegIdx)) + continue; + + // Must be caller-saved (clobbered by calls). + if (!IsCallerSaved(Reg)) + continue; + + // Must belong to at least one register class used by live ranges. + bool BelongsToUsedClass = llvm::any_of( + UsedRegClasses, [Reg](auto *RC) { return RC->contains(Reg); }); + if (!BelongsToUsedClass) + continue; + + // Must not overlap with any explicitly used register in the block. + if (OverlapsAny(Reg, llvm::map_range(AllPhysRegOperands, ToReg))) + continue; + + // Must not overlap with any implicit register. + if (OverlapsAny(Reg, ImplicitRegs)) + continue; + + // Must not overlap with any live-in register (respecting lane masks). + if (OverlapsAnyRMP(Reg, MBB.liveins())) + continue; + + // Must not overlap with any live-out register (respecting lane masks). + if (OverlapsAnyRMP(Reg, MBB.liveouts())) + continue; + + // Must not overlap with any reserved base register. + if (OverlapsAny(Reg, ReservedRegs)) + continue; + + // This register is safe to use as an additional allocation candidate. 
+ AvailablePhysRegs.insert(Reg); + ++NumUnusedCallerSavedAdded; + + LLVM_DEBUG(dbgs() << "Added unused caller-saved register: " + << TRI->getName(Reg) << "\n"); + } + + LLVM_DEBUG(dbgs() << "Added " << NumUnusedCallerSavedAdded + << " unused caller-saved registers to available set\n"); +} + +void RegLiveRangeTracker::markScarceRanges() { + // Mark live ranges as scarce if they have exactly 1 available register. + for (RegLiveRange &LR : LiveRanges) { + const TargetRegisterClass *RC = LR.getRegisterClass(); + if (!RC) { + continue; + } + + unsigned AvailableCount = 0; + for (MCPhysReg PhysReg : *RC) { + if (AvailablePhysRegs.count(PhysReg)) { + ++AvailableCount; + if (AvailableCount > 1) { + break; + } + } + } + + LR.setIsScarce(AvailableCount == 1); + } +} + +//===----------------------------------------------------------------------===// +// Analyze helper methods (decomposition of analyze()) +//===----------------------------------------------------------------------===// + +void RegLiveRangeTracker::buildInstructionOrderAndCollectOperands( + ArrayRef SemanticOrder, LivenessScanState &State) { + unsigned InstrIdx = 0; + for (MachineInstr *MI : SemanticOrder) { + InstrOrder[MI] = InstrIdx++; + + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg() || !MO.getReg().isPhysical()) { + continue; + } + if (MO.isImplicit()) { + // Track implicit registers - we won't create live ranges for these + // but will use them to invalidate explicit ranges. + const MCRegister Reg = MO.getReg().asMCReg(); + + // Add all aliases. + for (MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true); + AI.isValid(); ++AI) { + State.ImplicitRegs.insert(*AI); + } + } else { + AllPhysRegOperands.push_back(&MO); + } + } + } +} + +void RegLiveRangeTracker::initLiveRegsFromLiveOuts(const MachineBasicBlock &MBB, + LivenessScanState &State) { + // Initialize with live-out registers using NoLiveRange as sentinel and their + // lane masks. 
+ for (const auto &RMP : MBB.liveouts()) { + State.LiveRegs[RMP.PhysReg] = {RegLiveRange::NoLiveRange, RMP.LaneMask}; + } +} + +unsigned RegLiveRangeTracker::getOrCreateLiveRangeForOperand( + MCRegister Reg, MachineOperand *MO, LivenessScanState &State) { + bool IsReserved = false; + + // Check if this register or an aliasing register is already live. + // We need to find an entry where the lanes actually overlap, not just + // the registers. This is critical for separating live ranges: after + // x10 is defined, any y5 (containing x10) should only have x11's lanes + // live, and a subsequent x10 access should NOT merge into that y5 range. + auto It = llvm::find_if(State.LiveRegs, [Reg, TRI = TRI](const auto &Entry) { + if (!TRI->regsOverlap(Reg, Entry.first)) + return false; + + // Registers overlap - now check if lanes overlap. + const MCRegister LiveReg = Entry.first; + const LaneBitmask LiveLanes = Entry.second.second; + + // If LiveReg equals Reg, check if any lanes are live. + if (LiveReg == Reg) + return LiveLanes.any(); + + // Check if Reg is a subreg of LiveReg. + for (MCSubRegIndexIterator SubIdxIt(LiveReg, TRI); SubIdxIt.isValid(); + ++SubIdxIt) { + if (SubIdxIt.getSubReg() == Reg) { + // Reg is a subreg of LiveReg - check if Reg's lanes are live. + const LaneBitmask RegLanes = + TRI->getSubRegIndexLaneMask(SubIdxIt.getSubRegIndex()); + return (LiveLanes & RegLanes).any(); + } + } + + // Check if LiveReg is a subreg of Reg. + for (MCSubRegIndexIterator SubIdxIt(Reg, TRI); SubIdxIt.isValid(); + ++SubIdxIt) { + if (SubIdxIt.getSubReg() == LiveReg) { + // LiveReg is a subreg of Reg - if any lanes of LiveReg are live, + // they overlap with Reg. + return LiveLanes.any(); + } + } + + // Registers overlap but no subreg relationship - conservatively treat + // as overlapping if any lanes are live. 
+ return LiveLanes.any(); + }); + + if (It != State.LiveRegs.end()) { + const int LRIdx = It->second.first; + + if (LRIdx == RegLiveRange::NoLiveRange) { + // Found a live-out register (NoLiveRange sentinel). + // Mark the new range as reserved. + IsReserved = true; + } else { + // Found an aliasing live register with an actual live range. + assert(LRIdx >= 0 && "LRIdx must be valid"); + State.OperandToLiveRange[MO] = LRIdx; + + // Update base register for this live range if needed. + MCRegister CurrentBase = LiveRanges[LRIdx].getBaseReg(); + if (CurrentBase == MCRegister::NoRegister) { + // No base yet - expand base to include this register. + LiveRanges[LRIdx].expandBaseToInclude(Reg, TRI); + } else { + // Check if we need to update to a larger base register. + assert(CurrentBase.isPhysical() && "CurrentBase must be physical"); + assert(Reg.isPhysical() && "Reg must be physical"); + if (getSubRegIndex(Reg, CurrentBase) == 0 && + getSubRegIndex(CurrentBase, Reg) != 0) { + // Reg is larger than current base - update BaseReg and recompute + // SubRegIdx for all existing operands. + LiveRanges[LRIdx].expandBaseToInclude(Reg, TRI); + } + } + + return LRIdx; + } + } + + // Create a new live range. + const unsigned NewLRIdx = LiveRanges.size(); + LiveRanges.emplace_back(NextLiveRangeID++, Reg, IsReserved); + State.LiveRegs[Reg] = {static_cast(NewLRIdx), LaneBitmask::getAll()}; + State.OperandToLiveRange[MO] = NewLRIdx; + return NewLRIdx; +} + +void RegLiveRangeTracker::processDefsInInstruction(MachineInstr &MI, + LivenessScanState &State) { + for (MachineOperand &MO : MI.defs()) { + if (!MO.isReg() || !MO.getReg().isPhysical() || MO.isImplicit()) + continue; + + const MCRegister Reg = MO.getReg().asMCReg(); + const unsigned DefLRIdx = getOrCreateLiveRangeForOperand(Reg, &MO, State); + + // Add def to the live range with SubRegIdx relative to base. 
+ const MCRegister CurrentBase = LiveRanges[DefLRIdx].getBaseReg(); + const unsigned SubRegIdx = getSubRegIndex(Reg, CurrentBase); + LiveRanges[DefLRIdx].addDef(&MO, SubRegIdx); + + // Merge with any aliasing live ranges. + mergeAliasingLiveRanges(DefLRIdx, Reg, State.LiveRegs, + State.OperandToLiveRange); + } +} + +void RegLiveRangeTracker::processUsesInInstruction(MachineInstr &MI, + LivenessScanState &State) { + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.getReg().isPhysical() || MO.isImplicit()) + continue; + + const MCRegister Reg = MO.getReg().asMCReg(); + const unsigned LRIdx = getOrCreateLiveRangeForOperand(Reg, &MO, State); + + // Add use to the live range with SubRegIdx relative to base. + const MCRegister CurrentBase = LiveRanges[LRIdx].getBaseReg(); + const unsigned SubRegIdx = getSubRegIndex(Reg, CurrentBase); + LiveRanges[LRIdx].addUse(&MO, SubRegIdx); + } +} + +void RegLiveRangeTracker::performLivenessScan( + ArrayRef SemanticOrder, LivenessScanState &State) { + // Process instructions in reverse semantic order (backward pass). + for (MachineInstr *MI : llvm::reverse(SemanticOrder)) { + // In backward pass: process defs first (they kill liveness), then uses + // (they start liveness). This order is critical for read-modify-write + // instructions where the same register is both read and written. + // The def terminates the current live range, and the use starts a new one. + processDefsInInstruction(*MI, State); + processUsesInInstruction(*MI, State); + } +} + +void RegLiveRangeTracker::applySafetyFiltering( + const MachineBasicBlock &MBB, const LivenessScanState &State, + const DenseMap &LocalLiveLaneMasks) { + LLVM_DEBUG({ dump("CANDIDATE LIVE RANGES\n"); }); + LLVM_DEBUG(dbgs() << "\nFirst-stage filtering: " << LiveRanges.size() + << " candidate ranges\n"); + + SmallVector SafeRanges; + for (const RegLiveRange &LR : LiveRanges) { + // Skip invalid/cleared ranges from merging. 
+ if (LR.getID() < 0) + continue; + + // Filter out live ranges whose base register is not fully defined. + // This checks that the range doesn't read from live-in values, which + // would make it unsafe to virtualize (we'd be changing loop-carried + // values). This also implicitly handles use-before-def cases. + if (!isFullyDefined(LR, LocalLiveLaneMasks, MBB)) { + LLVM_DEBUG({ + dbgs() << "Reject: base register not fully defined in block: "; + LR.dumpBrief(TRI); + }); + continue; + } + + // Filter out any live range that uses an implicit register. + auto UsesImplicitReg = [&State](const RegOperandInfo &OperInfo) { + const MCRegister Reg = OperInfo.getOperand()->getReg().asMCReg(); + return State.ImplicitRegs.count(Reg) > 0; + }; + + if (llvm::any_of(LR.operands(), UsesImplicitReg)) { + LLVM_DEBUG({ + dbgs() << "Reject: uses implicit register "; + for (const auto &OI : LR.operands()) { + MCRegister R = OI.getOperand()->getReg().asMCReg(); + if (State.ImplicitRegs.count(R)) { + dbgs() << TRI->getName(R) << " "; + break; + } + } + dbgs() << ": "; + LR.dumpBrief(TRI); + }); + continue; + } + + // Reject tied operands. + if (hasTiedOperands(LR)) { + LLVM_DEBUG({ + dbgs() << "Reject: has tied operands: "; + LR.dumpBrief(TRI); + }); + continue; + } + + // Note: We don't check killedBeforeEndOfBlock because: + // 1. Live-out is already filtered by isCarriedByLiveInOut check + // 2. We want to allow def-only ranges (garbage bin registers) + + LLVM_DEBUG({ + dbgs() << "Keep: "; + LR.dumpBrief(TRI); + }); + SafeRanges.push_back(LR); + } + + LLVM_DEBUG(dbgs() << "After first-stage: " << SafeRanges.size() + << " safe ranges\n"); + + LiveRanges = std::move(SafeRanges); +} + +void RegLiveRangeTracker::computeRegisterClassesAndFilter() { + LLVM_DEBUG(dbgs() << "\nRegister class computation and filtering\n"); + + SmallVector ValidRanges; + for (RegLiveRange &LR : LiveRanges) { + computeRegisterClass(LR); + + // Filter out ranges with no valid register class. 
+ if (!LR.getRegisterClass()) { + LLVM_DEBUG({ + dbgs() << "Reject: no valid register class: "; + LR.dumpBrief(TRI); + }); + continue; + } + + // Apply register class filtering if specified. + if (!ExcludeLiveRangesByRegClass.empty() && + StringRef(TRI->getRegClassName(LR.getRegisterClass())) == + ExcludeLiveRangesByRegClass) { + LLVM_DEBUG({ + dbgs() << "Reject: excluded register class " + << TRI->getRegClassName(LR.getRegisterClass()) << ": "; + LR.dumpBrief(TRI); + }); + continue; + } + + ValidRanges.push_back(std::move(LR)); + } + LiveRanges = std::move(ValidRanges); + + LLVM_DEBUG(dbgs() << "After register class filtering: " << LiveRanges.size() + << " ranges\n"); +} + +void RegLiveRangeTracker::finalizeAvailabilityAndScarcity( + MachineBasicBlock &MBB, const LivenessScanState &State) { + // Second-stage full coverage pruning. + // This happens AFTER register class filtering. + pruneByFullCoverage(); + + // Compute and cache available physical registers. + const DenseSet ReservedRegs = collectReservedBaseRegs(); + computeAvailableFromLiveRanges(ReservedRegs); + deriveSuperRegsFromSubRegs(); + + addUnusedCallerSavedRegs(MBB, State.ImplicitRegs, ReservedRegs); + markScarceRanges(); + + // Compute and cache the most promising scarce range set. + MostPromisingScarceRanges = findMostPromisingScarceRanges(AvailablePhysRegs); +} + +void RegLiveRangeTracker::analyze(MachineBasicBlock &MBB, + ArrayRef SemanticOrder) { + assert(!SemanticOrder.empty() && "SemanticOrder must be provided - MBB order " + "is unreliable after scheduling"); + clear(); + + // Initialize state for liveness scan. + LivenessScanState State; + + // Build instruction order map and collect operands. + buildInstructionOrderAndCollectOperands(SemanticOrder, State); + + // Initialize live registers from live-outs. + initLiveRegsFromLiveOuts(MBB, State); + + // Perform the liveness scan to build live ranges. 
+ performLivenessScan(SemanticOrder, State); + + // Extract lane masks from LiveRegs for the isFullyDefined check. + DenseMap LocalLiveLaneMasks; + for (const auto &[Reg, Info] : State.LiveRegs) { + LocalLiveLaneMasks[Reg] = Info.second; + } + + // Apply first-stage safety filtering. + applySafetyFiltering(MBB, State, LocalLiveLaneMasks); + + // Compute register classes and apply filtering. + computeRegisterClassesAndFilter(); + + // Finalize availability and scarcity. + finalizeAvailabilityAndScarcity(MBB, State); +} + +void RegLiveRange::setRegisterClass(const TargetRegisterClass *RC) { + RegisterClass = RC; + + // Populate AdmissibleRegs from RegisterClass. + // This is initially equivalent to the RC membership, but can be further + // constrained later by per-LR requirements (e.g., bypass constraints). + AdmissibleRegs.clear(); + if (RC) { + for (MCPhysReg Reg : *RC) { + AdmissibleRegs.insert(Reg); + } + } +} + +void RegLiveRangeTracker::computeRegisterClass(RegLiveRange &LR) const { + if (LR.getBaseReg() == MCRegister::NoRegister) + return; + + // Start with nullptr, representing the universe of all register classes. 
+ // Intersection with nullptr is identity: intersect(nullptr, X) = X + const TargetRegisterClass *CommonRC = nullptr; + + // Process all operands (defs and uses) to compute register class constraints + for (const auto &OpInfo : LR.operands()) { + MachineInstr *MI = OpInfo.getOperand()->getParent(); + const unsigned OpIdx = OpInfo.getOperand()->getOperandNo(); + + // Get the register class constraint for this operand + const TargetRegisterClass *OpRC = + MI->getRegClassConstraint(OpIdx, TII, TRI); + + if (OpRC) { + // Account for subregister access + if (OpInfo.getSubRegIdx() != 0) { + // Get the class that can be used with this subreg index + OpRC = TRI->getSubClassWithSubReg(OpRC, OpInfo.getSubRegIdx()); + } + + if (OpRC) { + // Intersect: nullptr is identity, otherwise find common subclass + if (!CommonRC) { + CommonRC = OpRC; + } else { + CommonRC = TRI->getCommonSubClass(CommonRC, OpRC); + if (!CommonRC) { + // No common class possible - this live range is illegal. + LR.setRegisterClass(nullptr); + return; + } + } + } + } + } + + // If no operand constraints were found, fall back to minimal class. + if (!CommonRC) { + CommonRC = TRI->getMinimalPhysRegClass(LR.getBaseReg()); + assert(CommonRC && "Physical register must have a register class"); + } + + LR.setRegisterClass(CommonRC); +} + +void RegLiveRangeTracker::virtualizeFilteredPhysRegs(OverlapPolicy Policy) { + assert(!RegistersVirtualized && "Registers are already virtualized"); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Clear the NoVRegs property. + MF->getProperties().reset(MachineFunctionProperties::Property::NoVRegs); + + // Build the set of RESERVED base registers. + DenseSet ReservedBases; + for (const RegLiveRange &LR : LiveRanges) { + if (LR.isReserved()) { + ReservedBases.insert(LR.getBaseReg()); + } + } + + // Create and rewrite virtual registers. Live ranges are created in reverse, + // so we run this loop in reverse order to make the dumps more intuitive. 
+ for (RegLiveRange &LR : reverse(LiveRanges)) { + // The analysis should have filtered out any live ranges without a valid + // register class. + assert(LR.getRegisterClass() && + "Live range must have a valid register class"); + + // The analysis should have assigned a base register to every live range. + assert(LR.getBaseReg() != MCRegister::NoRegister && + "Live range must have a base register"); + + // Never virtualize RESERVED ranges themselves. + if (LR.isReserved()) { + continue; + } + + // Apply the overlap policy. + if (Policy == OverlapPolicy::DisallowOverlapWithReservedBase) { + // Check if this LR's base register overlaps any RESERVED base. + bool OverlapsReserved = false; + for (MCRegister ReservedBase : ReservedBases) { + if (TRI->regsOverlap(LR.getBaseReg(), ReservedBase)) { + OverlapsReserved = true; + break; + } + } + if (OverlapsReserved) { + // Skip virtualization for this range. + continue; + } + } + // If Policy == AllowOverlapWithReservedBase, we proceed to virtualize. + + // Create a virtual register for this live range. + const Register VReg = MRI.createVirtualRegister(LR.getRegisterClass()); + + // Store the VReg in the LiveRange for later mapping. + LR.setVReg(VReg); + + // Replace all operands in this live range with the virtual register. + const auto RewriteOperand = [VReg](const RegOperandInfo &Info) { + MachineOperand *MO = Info.getOperand(); + MO->setReg(VReg); + MO->setSubReg(Info.getSubRegIdx()); + }; + + // Rewrite all operands. + for (const auto &OpInfo : LR.operands()) { + RewriteOperand(OpInfo); + } + } + + // Mark as virtualized even if no live ranges were virtualized. 
+ RegistersVirtualized = true; +} + +void RegLiveRangeTracker::rewriteToPhysRegs( + const DenseMap &VRegToPhysMap) { + assert(RegistersVirtualized && "Registers are not virtualized"); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + + for (const RegLiveRange &LR : LiveRanges) { + const Register VReg = LR.getVReg(); + + // Skip live ranges that were not virtualized (partial virtualization). + if (!VReg.isValid()) { + continue; + } + + // Look up the physical register for this virtual register. + auto It = VRegToPhysMap.find(VReg); + assert(It != VRegToPhysMap.end() && + "VReg must have a mapping in VRegToPhysMap"); + + const MCRegister PhysReg = It->second; + + // Rewrite all operands in this live range to the physical register. + for (const auto &OpInfo : LR.operands()) { + MachineOperand *MO = OpInfo.getOperand(); + if (MO->getReg() == VReg) { + // Compute the actual physical register considering subregs. + Register FinalReg = PhysReg; + if (OpInfo.getSubRegIdx() != 0) { + FinalReg = TRI->getSubReg(PhysReg, OpInfo.getSubRegIdx()); + assert(FinalReg && "Invalid subregister index for physical register"); + } + MO->setReg(FinalReg); + MO->setSubReg(0); + } + } + } + + // Clear virtual registers from MRI and restore NoVRegs property. + MRI.clearVirtRegs(); + MF->getProperties().set(MachineFunctionProperties::Property::NoVRegs); + + // Mark as no longer virtualized. 
+ RegistersVirtualized = false; + + LLVM_DEBUG(dbgs() << "Rewritten virtual registers to physical registers\n"); +} + +void RegLiveRangeTracker::restoreOriginalPhysRegs() { + // Build the mapping from VRegs to their original PhysRegs + DenseMap VRegToPhysMap; + for (const RegLiveRange &LR : LiveRanges) { + if (LR.getVReg().isValid()) { + VRegToPhysMap[LR.getVReg()] = LR.getBaseReg(); + } + } + + // Use the general rewrite method + rewriteToPhysRegs(VRegToPhysMap); + LLVM_DEBUG(dbgs() << "Restored original physical registers\n"); +} + +bool RegLiveRangeTracker::areRegistersVirtualized() const { + return RegistersVirtualized; +} + +void RegLiveRangeTracker::filterByRegisterAvailability() { + // Lambda to check if a live range has only one choice of physical register. + auto HasNoChoice = [&](const RegLiveRange &LR) -> bool { + // By this point, all live ranges should have a register class. + assert(LR.getRegisterClass() && "Live range must have a register class"); + + // Count how many physical registers from this register class are available. + unsigned AvailableCount = 0; + for (MCPhysReg PhysReg : *LR.getRegisterClass()) { + if (AvailablePhysRegs.count(PhysReg)) { + AvailableCount++; + // If we find at least 2, this live range has choices. + if (AvailableCount > 1) { + return false; + } + } + } + + // Has no choice if 0 or 1 available registers. + return true; + }; + + // Build a new list of live ranges, excluding those with no choice. + SmallVector FilteredLiveRanges; + + for (const RegLiveRange &LR : LiveRanges) { + // Skip live ranges that have no choice of physical register. + if (HasNoChoice(LR)) { + LLVM_DEBUG(dbgs() << "Filtering out live range for " + << TRI->getName(LR.getBaseReg()) + << " - no alternative physical registers\n"); + continue; + } + + // This live range has choices, keep it. + FilteredLiveRanges.push_back(LR); + } + + // Replace the live ranges with the filtered set. 
+ LiveRanges = std::move(FilteredLiveRanges); + + LLVM_DEBUG(dbgs() << "Register availability filtering complete: " + << LiveRanges.size() << " live ranges remaining\n"); +} + +void RegLiveRangeTracker::clear() { + // Clear all containers. + LiveRanges.clear(); + AllPhysRegOperands.clear(); + InstrOrder.clear(); + + // Reset the virtualization flag. + RegistersVirtualized = false; + + // Reset the ID counter. + NextLiveRangeID = 0; + + // Note: MF, TRI, and TII are not cleared as they are set in the constructor + // and represent the context in which this tracker operates. +} + +void RegLiveRangeTracker::dump(const char *Header) const { + if (Header) { + dbgs() << Header; + } + dbgs() << "================================\n"; + dbgs() << "Total live ranges: " << LiveRanges.size() << "\n\n"; + + // Create a sorted index array to ensure deterministic output + SmallVector SortedIndices; + for (size_t LRIdx = 0; LRIdx < LiveRanges.size(); ++LRIdx) { + SortedIndices.push_back(LRIdx); + } + + // Sort by base register ID first, then by first def instruction pointer + // This ensures a stable, deterministic order + llvm::sort(SortedIndices, [this](size_t A, size_t B) { + const RegLiveRange &LRA = LiveRanges[A]; + const RegLiveRange &LRB = LiveRanges[B]; + + // First sort by base register ID + if (LRA.getBaseReg() != LRB.getBaseReg()) { + return LRA.getBaseReg() < LRB.getBaseReg(); + } + + // Then by first def instruction address (if any) + if (!LRA.defs().empty() && !LRB.defs().empty()) { + const MachineInstr *MIA = LRA.defs().begin()->getOperand()->getParent(); + const MachineInstr *MIB = LRB.defs().begin()->getOperand()->getParent(); + if (MIA != MIB) { + // Use instruction order if available + auto ItA = InstrOrder.find(MIA); + auto ItB = InstrOrder.find(MIB); + if (ItA != InstrOrder.end() && ItB != InstrOrder.end()) { + return ItA->second < ItB->second; + } + } + } + + // Finally by original index for stability + return A < B; + }); + + for (size_t SortedIdx = 0; 
SortedIdx < SortedIndices.size(); ++SortedIdx) { + const size_t LRIdx = SortedIndices[SortedIdx]; + const RegLiveRange &LR = LiveRanges[LRIdx]; + + // Skip invalid/cleared ranges + if (LR.getID() < 0) + continue; + + // Use the stored base register + const MCRegister BaseReg = LR.getBaseReg(); + StringRef PrimaryReg = "unknown"; + if (BaseReg != MCRegister::NoRegister) { + PrimaryReg = TRI->getName(BaseReg); + } + + dbgs() << "Live Range #" << LR.getID() << " for " << PrimaryReg; + if (LR.isReserved()) { + dbgs() << " [RESERVED]"; + } + dbgs() << ":\n"; + + dbgs() << " Definitions (" << LR.getNumDefs() << "):\n"; + size_t DefIdx = 0; + for (const RegOperandInfo &DefInfo : LR.defs()) { + dbgs() << " [" << DefIdx++ << "] "; + Register Reg = DefInfo.getOperand()->getReg(); + if (Reg.isPhysical()) { + dbgs() << "Register: " << TRI->getName(Reg); + } else { + dbgs() << "Register: %vreg" << Reg.virtRegIndex(); + } + if (DefInfo.getSubRegIdx() != 0) { + dbgs() << " (SubRegIdx: " << DefInfo.getSubRegIdx() << ")"; + } + dbgs() << " "; + if (MachineInstr *DefInstr = DefInfo.getOperand()->getParent()) { + dbgs() << AIE::NoDebug(*DefInstr) << "\n"; + } else { + dbgs() << "\n"; + } + } + + dbgs() << " Uses (" << LR.getNumUses() << "):\n"; + size_t UseIdx = 0; + for (const RegOperandInfo &UseInfo : LR.uses()) { + dbgs() << " [" << UseIdx++ << "] "; + Register Reg = UseInfo.getOperand()->getReg(); + if (Reg.isPhysical()) { + dbgs() << "Register: " << TRI->getName(Reg); + } else { + dbgs() << "Register: %vreg" << Reg.virtRegIndex(); + } + if (UseInfo.getSubRegIdx() != 0) { + dbgs() << " (SubRegIdx: " << UseInfo.getSubRegIdx() << ")"; + } + dbgs() << " "; + if (MachineInstr *UseInstr = UseInfo.getOperand()->getParent()) { + dbgs() << AIE::NoDebug(*UseInstr) << "\n"; + } else { + dbgs() << "\n"; + } + } + dbgs() << "\n"; + } + + // Dump available physical registers if live ranges exist. 
+ if (!LiveRanges.empty()) { + DenseSet AvailablePhysRegs = getAvailablePhysRegs(); + dbgs() << "Available Physical Registers for Reallocation:\n"; + dbgs() << "==============================================\n"; + SmallVector SortedRegs(AvailablePhysRegs.begin(), + AvailablePhysRegs.end()); + llvm::sort(SortedRegs); + for (MCRegister Reg : SortedRegs) { + // MCRegister should always be physical, but check to be safe. + if (Reg.isPhysical()) { + dbgs() << " " << TRI->getName(Reg) << "\n"; + } + } + dbgs() << "Total: " << AvailablePhysRegs.size() << " registers\n\n"; + } + + // Emit end marker if header was provided + if (Header) { + dbgs() << "=== END " << Header; + } +} + +std::vector +RegLiveRangeTracker::findMostPromisingScarceRanges( + const DenseSet &AvailablePhysRegs) const { + + // Group live ranges by base register (not register class). + // This ensures we only get ranges for the same physical register. + DenseMap> RangesByBaseReg; + + for (const auto &LR : LiveRanges) { + // Only consider ranges that are marked as scarce. + if (!LR.isScarce()) { + continue; + } + + const MCRegister BaseReg = LR.getBaseReg(); + assert(BaseReg != MCRegister::NoRegister && + "LiveRange must have a BaseReg after analysis"); + + RangesByBaseReg[BaseReg].push_back(&LR); + } + + // Helper to check if a set of ranges has overlapping instructions. + auto HasOverlap = [](const std::vector &Ranges) { + DenseSet SeenInstrs; + for (const RegLiveRange *LR : Ranges) { + for (const auto &Info : LR->operands()) { + if (!SeenInstrs.insert(Info.getOperand()->getParent()).second) { + return true; + } + } + } + return false; + }; + + // Find the largest non-overlapping set with actual competition. 
+ std::vector LargestSet; + for (const auto &Entry : RangesByBaseReg) { + const auto &Ranges = Entry.second; + + if (Ranges.size() > 1 && !HasOverlap(Ranges) && + Ranges.size() > LargestSet.size()) { + LargestSet = Ranges; + } + } + + return LargestSet; +} diff --git a/llvm/lib/Target/AIE/AIERegDefUseTracker.h b/llvm/lib/Target/AIE/AIERegDefUseTracker.h new file mode 100644 index 000000000000..459c34fcb18d --- /dev/null +++ b/llvm/lib/Target/AIE/AIERegDefUseTracker.h @@ -0,0 +1,427 @@ +//===- AIERegDefUseTracker.h - Track Register Live Ranges ----------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file contains declarations for tracking and analyzing register live +// ranges in a MachineBasicBlock. 
The tracker performs the following: +// - Identifies register definitions and uses that form live ranges +// - Merges aliasing register accesses into unified live ranges +// - Filters out unsafe ranges (tied operands, live-in/out, implicit uses) +// - Computes appropriate register classes for each live range +// - Optionally replaces physical registers with virtual registers for testing +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIEREGDEFUSETRACKER_H +#define LLVM_LIB_TARGET_AIE_AIEREGDEFUSETRACKER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/MC/MCRegister.h" + +namespace llvm { + +struct AIEBaseInstrInfo; +struct LaneBitmask; +class MachineBasicBlock; +class MachineFunction; +class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; +class TargetRegisterInfo; +class TargetRegisterClass; + +/// Represents a register operand with its sub-register index +class RegOperandInfo { + MachineOperand *Operand; + unsigned SubRegIdx; + +public: + RegOperandInfo(MachineOperand *Op, unsigned SubIdx = 0) + : Operand(Op), SubRegIdx(SubIdx) {} + + MachineOperand *getOperand() const { return Operand; } + unsigned getSubRegIdx() const { return SubRegIdx; } +}; + +/// Structure representing a live range for a register +/// A live range can have multiple definitions (e.g., when different +/// sub-registers are defined separately) and multiple uses +class RegLiveRange { +public: + // Sentinel value for live-out registers not yet associated with a live range + static constexpr int NoLiveRange = -1; + +private: + // All definitions that contribute to this live range + SmallVector Defs; + + // All uses of this live range + SmallVector Uses; + + // Base register for this live range (largest register that covers all + // 
operands) + MCRegister BaseReg = MCRegister::NoRegister; + + // Register class that satisfies all constraints for this live range. + const TargetRegisterClass *RegisterClass = nullptr; + + // Explicit set of admissible physical registers for this live range. + // This represents the semantic constraint: which registers can be used + // based on instruction encoding. Initially populated from RegisterClass, + // but can be further constrained by per-LR requirements (e.g., bypass). + // Note: this is separate from availability - PostRegAlloc intersects this + // with the global available registers set to get candidates. + DenseSet AdmissibleRegs; + + // Virtual register assigned to this live range (if virtualized) + Register VReg; + + // Whether this live range is scarce (has exactly 1 available register) + bool IsScarce = false; + + // Whether this live range is reserved (virtualizable but register reserved). + // This is used for disjoint live ranges that share a physical register with + // subsequent full defs. The range can be virtualized to allow pipelining, + // but its physical register must remain reserved for the subsequent def. + bool IsReserved = false; + + // Unique ID for this live range (for debugging/tracking) + // Use -1 as sentinel for invalid/cleared ranges + int ID = -1; + +public: + RegLiveRange() = default; + + /// Construct a live range with the given ID, base register, and reserved + /// status. This is the primary constructor used when creating new ranges. 
+ RegLiveRange(int ID, MCRegister BaseReg, bool IsReserved = false) + : BaseReg(BaseReg), IsReserved(IsReserved), ID(ID) {} + + void addDef(MachineOperand *DefOp, unsigned SubRegIdx); + void addUse(MachineOperand *UseOp, unsigned SubRegIdx); + + /// Get the number of definitions + size_t getNumDefs() const { return Defs.size(); } + + /// Get the number of uses + size_t getNumUses() const { return Uses.size(); } + + /// Iterator access to definitions + auto defs() const { return llvm::make_range(Defs.begin(), Defs.end()); } + + /// Iterator access to uses + auto uses() const { return llvm::make_range(Uses.begin(), Uses.end()); } + + /// Iterator across all defs and uses. + auto operands() const { + return llvm::concat(Uses, Defs); + } + + /// Get the base register for this live range. + MCRegister getBaseReg() const { return BaseReg; } + + /// Get the register class for this live range. + const TargetRegisterClass *getRegisterClass() const { return RegisterClass; } + + /// Get the admissible physical registers for this live range. + const DenseSet &getAdmissibleRegs() const { + return AdmissibleRegs; + } + + /// Check if a register is admissible for this live range. + bool isAdmissible(MCRegister Reg) const { + return AdmissibleRegs.contains(Reg); + } + + /// Get the number of admissible registers. 
+ size_t getNumAdmissibleRegs() const { return AdmissibleRegs.size(); } + + /// Get the virtual register assigned to this live range + Register getVReg() const { return VReg; } + + /// Set the virtual register for this live range + void setVReg(Register R) { VReg = R; } + + /// Check if this live range is scarce (has exactly 1 available register) + bool isScarce() const { return IsScarce; } + + /// Set whether this live range is scarce + void setIsScarce(bool Scarce) { IsScarce = Scarce; } + + /// Check if this live range is reserved (virtualizable but register reserved) + bool isReserved() const { return IsReserved; } + + /// Set whether this live range is reserved + void setIsReserved(bool Reserved) { IsReserved = Reserved; } + + /// Get the unique ID for this live range. + int getID() const { return ID; } + + /// Set the register class and populate AdmissibleRegs. + /// AdmissibleRegs is initially populated from the register class membership. + void setRegisterClass(const TargetRegisterClass *RC); + + /// Merge another live range into this one. + /// Copies all defs and uses from Other into this range. + /// Updates BaseReg to the smallest register that contains all operands from + /// both ranges. This handles sibling registers (e.g., cml4 and cmh4) by + /// finding their common super-register (dm4). + /// Other is NOT cleared after the merge (caller must do that if needed). + /// @param Other The live range to merge from. + /// @param TRI Target register info for computing sub-register indices. + void mergeFrom(const RegLiveRange &Other, const TargetRegisterInfo *TRI); + + /// Expand the base register to include an external register. + /// This is used for registers that affect the live range's base (e.g., + /// live-out sentinels) but don't have corresponding operands. + /// If ExtReg is larger than BaseReg, or if they are siblings requiring + /// a common super-register, BaseReg is updated accordingly. 
+ /// Existing operands have their SubRegIdx values recomputed. + /// @param ExtReg The external register to include. + /// @param TRI Target register info for computing sub-register indices. + void expandBaseToInclude(MCRegister ExtReg, const TargetRegisterInfo *TRI); + + /// Clear all state, making this an invalid/empty range. + void clear(); + + /// Check if this live range is empty/invalid. + bool isEmpty() const { return ID < 0; } + + /// Dump a brief summary of this live range for debugging. + void dumpBrief(const TargetRegisterInfo *TRI) const; +}; + +/// Tracker for register live ranges in a MachineBasicBlock +class RegLiveRangeTracker { + MachineFunction *MF; + const TargetRegisterInfo *TRI; + const AIEBaseInstrInfo *TII; + + // List of all live ranges found in the block + SmallVector LiveRanges; + + // All physical register operands in the block + SmallVector AllPhysRegOperands; + + // Instruction order mapping for determining earliest operand + DenseMap InstrOrder; + + // Track whether registers have been virtualized + mutable bool RegistersVirtualized = false; + + // Cached available physical registers (computed during analyze) + DenseSet AvailablePhysRegs; + + // Cached most promising scarce range set (computed during analyze) + std::vector MostPromisingScarceRanges; + + // Counter for assigning unique IDs to live ranges + int NextLiveRangeID = 0; + + /// Get the sub-register index if AccessReg is a sub-register of BaseReg + /// Returns 0 if AccessReg is not a sub-register of BaseReg + unsigned getSubRegIndex(MCRegister AccessReg, MCRegister BaseReg) const; + + /// Check if a register overlaps with any register in a set + bool overlapsAnyInSet(MCRegister Reg, + const DenseSet &RegSet) const; + + /// Compute the register class for a live range based on all its operands + void computeRegisterClass(RegLiveRange &LR) const; + + /// First-stage safety filtering. 
+ bool hasTiedOperands(const RegLiveRange &LR) const; + + /// Check if a live range's base register is fully defined in the block. + /// Uses lane mask intersection with the block's live-in set to determine + /// if the register is truly defined within the block or comes from outside. + /// This can discriminate between a truly undefined register (not in live-in, + /// safe to virtualize) and a register defined outside the loop (in live-in, + /// should be rejected to preserve loop-carried values). + bool + isFullyDefined(const RegLiveRange &LR, + const DenseMap &LocalLiveLaneMasks, + const MachineBasicBlock &MBB) const; + + /// Second-stage full coverage pruning + void pruneByFullCoverage(); + + /// Merge aliasing live ranges when a definition is encountered. + void mergeAliasingLiveRanges( + unsigned DefLRIdx, MCRegister DefReg, + DenseMap> &LiveRegs, + DenseMap &OperandToLiveRange); + + /// Helper to find the most promising scarce range set. + /// Called by analyze() to populate MostPromisingScarceRanges. + std::vector findMostPromisingScarceRanges( + const DenseSet &AvailablePhysRegs) const; + + /// Collect base registers from RESERVED live ranges. + DenseSet collectReservedBaseRegs() const; + + /// Populate AvailablePhysRegs from non-reserved live ranges. + /// Adds base registers and sub-registers that don't overlap with reserved. + void computeAvailableFromLiveRanges(const DenseSet &ReservedRegs); + + /// Extend AvailablePhysRegs with super-registers whose sub-regs are all + /// available. + void deriveSuperRegsFromSubRegs(); + + /// Add caller-saved registers that are completely unused in the block. + /// Uses AllPhysRegOperands member for used registers, and iterates + /// MBB.liveins() and MBB.liveouts() directly (with lane mask support). + /// @param MBB The machine basic block (for live-in/out iteration). + /// @param ImplicitRegs Registers used implicitly. + /// @param ReservedRegs Reserved base registers. 
+ void addUnusedCallerSavedRegs(MachineBasicBlock &MBB, + const DenseSet &ImplicitRegs, + const DenseSet &ReservedRegs); + + /// Mark live ranges as scarce if they have exactly 1 available register. + void markScarceRanges(); + + //===--------------------------------------------------------------------===// + // Analyze helper methods (decomposition of analyze()) + //===--------------------------------------------------------------------===// + + /// State passed through the liveness scan. + /// Groups the mutable state that is threaded through the backward scan. + struct LivenessScanState { + /// Map from register to its current live range index (signed) and lane + /// mask. Use NoLiveRange as sentinel for live-out registers not yet + /// associated with a range. + DenseMap> LiveRegs; + + /// Map from operand to live range index. + DenseMap OperandToLiveRange; + + /// Set of registers used implicitly (invalidates explicit ranges). + DenseSet ImplicitRegs; + }; + + /// Build instruction order map and collect physical register operands. + /// Also populates ImplicitRegs. + void buildInstructionOrderAndCollectOperands( + ArrayRef SemanticOrder, LivenessScanState &State); + + /// Initialize LiveRegs from live-out registers. + void initLiveRegsFromLiveOuts(const MachineBasicBlock &MBB, + LivenessScanState &State); + + /// Get or create a live range for a register operand. + /// Returns the live range index. + unsigned getOrCreateLiveRangeForOperand(MCRegister Reg, MachineOperand *MO, + LivenessScanState &State); + + /// Process def operands for a single instruction (reverse pass). + void processDefsInInstruction(MachineInstr &MI, LivenessScanState &State); + + /// Process use operands for a single instruction (reverse pass). + void processUsesInInstruction(MachineInstr &MI, LivenessScanState &State); + + /// Perform the liveness scan over all instructions. 
+  void performLivenessScan(ArrayRef SemanticOrder,
+                           LivenessScanState &State);
+
+  /// Apply first-stage safety filtering to live ranges.
+  /// Declared void: nothing is handed back to the caller. LocalLiveLaneMasks
+  /// is an input -- presumably the lane masks that isFullyDefined consumes;
+  /// TODO confirm against the definition.
+  void applySafetyFiltering(
+      const MachineBasicBlock &MBB, const LivenessScanState &State,
+      const DenseMap &LocalLiveLaneMasks);
+
+  /// Compute register classes and apply register class filtering.
+  void computeRegisterClassesAndFilter();
+
+  /// Finalize available registers and scarcity after all filtering.
+  void finalizeAvailabilityAndScarcity(MachineBasicBlock &MBB,
+                                       const LivenessScanState &State);
+
+public:
+  // NOTE(review): single-argument constructor is not marked 'explicit', so a
+  // MachineBasicBlock converts implicitly to a tracker -- confirm intended.
+  RegLiveRangeTracker(MachineBasicBlock &MBB);
+
+  /// Process a MachineBasicBlock to find all register live ranges
+  /// @param MBB The machine basic block to analyze
+  /// @param SemanticOrder The semantic instruction order (required - must be
+  /// non-empty)
+  void analyze(MachineBasicBlock &MBB, ArrayRef SemanticOrder);
+
+  /// Get all live ranges
+  ArrayRef getLiveRanges() const { return LiveRanges; }
+
+  /// Dump the live range information for debugging
+  /// @param Header Optional header string to print before the dump
+  void dump(const char *Header = nullptr) const;
+
+  /// Overlap policy for virtualization with respect to RESERVED ranges.
+  enum class OverlapPolicy {
+    /// Do not virtualize any range that overlaps a RESERVED base register.
+    /// This is the safe default that prevents regressions.
+    DisallowOverlapWithReservedBase,
+    /// Allow virtualizing ranges that overlap RESERVED bases.
+    /// This enables the RESERVED semantics for disjoint ranges sharing a base.
+    AllowOverlapWithReservedBase
+  };
+
+  /// Replace filtered physical registers with virtual registers.
+  /// This modifies the MachineBasicBlock and updates LiveRanges with VReg info.
+  /// RESERVED ranges themselves are never virtualized.
+  /// Other ranges may be filtered based on the policy.
+ /// This is a non-destructive operation that supports partial virtualization. + void virtualizeFilteredPhysRegs( + OverlapPolicy Policy = OverlapPolicy::DisallowOverlapWithReservedBase); + + /// Get the set of physical registers that would be available for reallocation + /// Returns the cached value computed during analyze() + const DenseSet &getAvailablePhysRegs() const { + return AvailablePhysRegs; + } + + /// Rewrite virtual registers to physical registers using the provided + /// mapping. + /// @param VRegToPhysMap Mapping from virtual registers to physical registers + void rewriteToPhysRegs(const DenseMap &VRegToPhysMap); + + /// Restore original physical registers from virtual registers + /// Uses the LiveRanges to map VRegs back to their original PhysRegs + /// This is a convenience method that builds the mapping and calls + /// rewriteToPhysRegs + void restoreOriginalPhysRegs(); + + /// Check if registers are currently virtualized + bool areRegistersVirtualized() const; + + /// Filter live ranges based on available physical registers. + /// Removes live ranges that have only one available physical register + /// for their register class, as these should stay physical to avoid + /// pipeliner invalidation. + /// Uses the cached AvailablePhysRegs computed during analyze(). + void filterByRegisterAvailability(); + + /// Clear all state and bring the tracker back to its default constructed + /// state + void clear(); + + /// Get the most promising scarce range set for packing. + /// Returns the cached value computed during analyze(). + /// An empty vector signals that no such set could be found. 
+  const std::vector &
+  getMostPromisingScarceRanges() const {
+    return MostPromisingScarceRanges;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AIE_AIEREGDEFUSETRACKER_H
diff --git a/llvm/lib/Target/AIE/AIEScarceRegScheduling.cpp b/llvm/lib/Target/AIE/AIEScarceRegScheduling.cpp
new file mode 100644
index 000000000000..b4063e8b2440
--- /dev/null
+++ b/llvm/lib/Target/AIE/AIEScarceRegScheduling.cpp
@@ -0,0 +1,418 @@
+//===- AIEScarceRegScheduling.cpp - Scarce Register Scheduling Strategy --===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+//
+//===----------------------------------------------------------------------===//
+// This file implements a PostPipelinerStrategy that prioritizes scheduling
+// decisions based on scarce register pressure.
+//===----------------------------------------------------------------------===//
+
+#include "AIEScarceRegScheduling.h"
+#include "AIERegDefUseTracker.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+#define DEBUG_TYPE "scarce-reg-sched"
+
+namespace llvm::AIE {
+
+/// Build the member list of a scarce range from its live range.
+/// Gathers the parent instruction of every def and use operand of LR, then
+/// records the NodeNum of the first SUnit in DAG.SUnits that carries each of
+/// those instructions. Members ends up in ascending SUnit order with at most
+/// one entry per instruction. LR is stored by reference in LiveRange.
+ScarceRange::ScarceRange(const RegLiveRange &LR, const ScheduleDAGInstrs &DAG)
+    : LiveRange(LR) {
+  // Collect all unique MachineInstr pointers from defs and uses.
+  DenseSet<const MachineInstr *> UniqueInstrs;
+
+  for (const auto &DefInfo : LR.defs()) {
+    MachineOperand *const DefOp = DefInfo.getOperand();
+    assert(DefOp && "DefOp should be valid");
+    MachineInstr *const DefMI = DefOp->getParent();
+    assert(DefMI && "Every operand should have a parent MachineInstr");
+    UniqueInstrs.insert(DefMI);
+  }
+
+  for (const auto &UseInfo : LR.uses()) {
+    MachineOperand *const UseOp = UseInfo.getOperand();
+    assert(UseOp && "UseOp should be valid");
+    MachineInstr *const UseMI = UseOp->getParent();
+    assert(UseMI && "Every operand should have a parent MachineInstr");
+    UniqueInstrs.insert(UseMI);
+  }
+
+  // Walk the SUnits in order and keep only the first (representative) SUnit
+  // of each collected instruction. An instruction is erased from the pending
+  // set as soon as it is matched, so a later SUnit carrying the same
+  // instruction is not recorded again and cannot make the loop stop before
+  // every instruction has been found.
+  for (const auto &SU : DAG.SUnits) {
+    const MachineInstr *const MI = SU.getInstr();
+    assert(MI && "Every SUnit should have a MachineInstr");
+    // erase() returns false when MI was never collected or already matched.
+    if (!UniqueInstrs.erase(MI)) {
+      continue;
+    }
+    Members.push_back(SU.NodeNum);
+    // Early break when we've found all unique instructions.
+    if (UniqueInstrs.empty()) {
+      break;
+    }
+  }
+
+  // Members are in SUnit order, which is deterministic.
+}
+
+/// Forwards DAG and Info to PostPipelinerStrategy with a LatestBias of 0 and
+/// stores the tracker reference and II; no other work is done here.
+ScarceRegScheduling::ScarceRegScheduling(ScheduleDAGInstrs &DAG,
+                                         ScheduleInfo &Info,
+                                         RegLiveRangeTracker &RegTracker,
+                                         int II)
+    : PostPipelinerStrategy(DAG, Info, /*LatestBias=*/0),
+      RegTracker(RegTracker), II(II) {}
+
+/// Precompute, per scarce range, its member list and the ancestors of its
+/// members that belong to no scarce range. ScarceRanges is kept by reference
+/// and must be non-empty. OrderedMembers is only sized here; init() fills it.
+BurstMostUrgentStrategy::BurstMostUrgentStrategy(
+    ScheduleDAGInstrs &DAG, ScheduleInfo &Info,
+    const std::vector &ScarceRanges, int LatestBias)
+    : PostPipelinerStrategy(DAG, Info, LatestBias), ScarceRanges(ScarceRanges),
+      CurrentSet(0) {
+
+  assert(!ScarceRanges.empty() &&
+         "BurstMostUrgentStrategy requires at least one scarce range");
+
+  // Build a set to track which SUnits are part of scarce ranges.
+  const size_t NumSUnits = Info.NInstr;
+  SmallVector IsScarceRangeMember(NumSUnits, false);
+  for (const auto &Range : ScarceRanges) {
+    for (int MemberIdx : Range.Members) {
+      assert(MemberIdx >= 0 && static_cast(MemberIdx) < NumSUnits &&
+             "Scarce range member index out of bounds");
+      IsScarceRangeMember[MemberIdx] = true;
+    }
+  }
+
+  // Precompute predecessors and members for each range (in original order).
+  Predecessors.reserve(ScarceRanges.size());
+  Members.reserve(ScarceRanges.size());
+
+  for (const auto &Range : ScarceRanges) {
+    // Collect non-scarce predecessors for this range.
+    SmallVector RangePredecessors;
+    for (int MemberIdx : Range.Members) {
+      const auto &MemberNode = Info[MemberIdx];
+      for (int AncestorIdx : MemberNode.Ancestors) {
+        // Only include non-scarce ancestors.
+        // Ancestors whose index lies outside the Info.NInstr-sized table are
+        // skipped by the bounds test.
+        if (static_cast(AncestorIdx) < IsScarceRangeMember.size() &&
+            !IsScarceRangeMember[AncestorIdx]) {
+          // Avoid duplicates.
+          if (std::find(RangePredecessors.begin(), RangePredecessors.end(),
+                        AncestorIdx) == RangePredecessors.end()) {
+            RangePredecessors.push_back(AncestorIdx);
+          }
+        }
+      }
+    }
+
+    Predecessors.push_back(std::move(RangePredecessors));
+    Members.push_back(Range.Members);
+  }
+
+  // Pre-size OrderedMembers (will be populated by init()).
+  OrderedMembers.resize(ScarceRanges.size() * 2);
+}
+
+void BurstMostUrgentStrategy::init(const SmallVector &RangeOrder) {
+  assert(RangeOrder.size() == ScarceRanges.size() &&
+         "RangeOrder must have the same size as ScarceRanges");
+
+  // Reset state.
+  CurrentSet = 0;
+
+  // Build OrderedMembers by interleaving predecessors and members in the given
+  // order.
+  for (size_t I = 0; I < RangeOrder.size(); ++I) {
+    const int RangeIdx = RangeOrder[I];
+    // Burst I occupies two consecutive slots: predecessors first, then the
+    // range members themselves.
+    OrderedMembers[2 * I] = Predecessors[RangeIdx];
+    OrderedMembers[2 * I + 1] = Members[RangeIdx];
+  }
+}
+
+/// Priority comparison between two candidate SUnits.
+/// Returns true when A should be preferred over B: a node that belongs to the
+/// set at OrderedMembers[CurrentSet] beats one that does not; otherwise the
+/// node with the smaller Info[...].Earliest wins.
+bool BurstMostUrgentStrategy::better(const SUnit &A, const SUnit &B) {
+  const int AIdx = A.NodeNum;
+  const int BIdx = B.NodeNum;
+
+  // Check if either is in the current set.
+  if (CurrentSet < OrderedMembers.size()) {
+    const auto &CurrentMembers = OrderedMembers[CurrentSet];
+    const bool AInSet = std::find(CurrentMembers.begin(), CurrentMembers.end(),
+                                  AIdx) != CurrentMembers.end();
+    const bool BInSet = std::find(CurrentMembers.begin(), CurrentMembers.end(),
+                                  BIdx) != CurrentMembers.end();
+
+    // Prefer members of the current set.
+    if (AInSet != BInSet) {
+      return AInSet;
+    }
+  }
+
+  // Default: prefer earlier earliest.
+  return Info[AIdx].Earliest < Info[BIdx].Earliest;
+}
+
+/// Notification hook invoked with the node that was just picked.
+/// N itself is not inspected; the function only checks whether every node of
+/// the current set is marked Scheduled and, if so, advances CurrentSet by one.
+/// At most one set is advanced per call, so an empty set is only skipped on
+/// the following call.
+void BurstMostUrgentStrategy::selected(const SUnit &N) {
+  // Check if we've completed the current set.
+  if (CurrentSet < OrderedMembers.size()) {
+    const auto &CurrentMembers = OrderedMembers[CurrentSet];
+
+    // Check if all members of the current set are scheduled.
+    const bool AllMembersScheduled =
+        llvm::all_of(CurrentMembers, [this](int MemberIdx) {
+          return Info[MemberIdx].Scheduled;
+        });
+
+    // If all members are scheduled, advance to the next set.
+    if (AllMembersScheduled) {
+      ++CurrentSet;
+      LLVM_DEBUG(dbgs() << format("Completed set %zu, advancing to %zu\n",
+                                  CurrentSet - 1, CurrentSet));
+
+      // If we just completed a members set (odd index), simulate
+      // anti-dependences.
+      if ((CurrentSet - 1) % 2 == 1) {
+        // NOTE(review): BurstIdx is the position of the burst in the order
+        // given to init(), but it is used below as an index into ScarceRanges
+        // (original order). init() stores Members[RangeOrder[I]] at slot
+        // 2*I+1, so the two only agree when RangeOrder is the identity --
+        // confirm whether callers guarantee that or whether the range index
+        // should be taken from RangeOrder.
+        const size_t BurstIdx = (CurrentSet - 1) / 2;
+        const int RangeIdx =
+            (BurstIdx < ScarceRanges.size()) ? static_cast(BurstIdx) : -1;
+        if (RangeIdx >= 0) {
+          simulateAntiDependences(RangeIdx);
+        }
+      }
+    }
+  }
+}
+
+/// For every use operand of ScarceRanges[CompletedRangeIdx] and every def
+/// operand of each range with a larger index, raise the def node's Earliest
+/// to at least the use's Cycle plus the operand latency reported by the
+/// scheduling model.
+void BurstMostUrgentStrategy::simulateAntiDependences(int CompletedRangeIdx) {
+  const auto &CompletedRange = ScarceRanges[CompletedRangeIdx];
+  const auto *const SchedModel = DAG.getSchedModel();
+
+  LLVM_DEBUG(dbgs() << format("Simulating anti-dependences for range %d\n",
+                              CompletedRangeIdx));
+
+  // For each Use in the completed range's LiveRange.
+  for (const auto &UseInfo : CompletedRange.LiveRange.uses()) {
+    MachineOperand *const UseOp = UseInfo.getOperand();
+    assert(UseOp && "UseOp should be valid");
+    MachineInstr *const UseMI = UseOp->getParent();
+    assert(UseMI && "Every operand should have a parent MachineInstr");
+
+    const unsigned UseOpIdx = UseOp->getOperandNo();
+
+    // Find the corresponding SUnit index.
+    int UseSUIdx = -1;
+    for (const int MemberIdx : CompletedRange.Members) {
+      if (DAG.SUnits[MemberIdx].getInstr() == UseMI) {
+        UseSUIdx = MemberIdx;
+        break;
+      }
+    }
+    // Only checked in assert-enabled builds; a miss would index Info with -1.
+    assert(UseSUIdx >= 0 && "Use instruction should be in completed range");
+
+    const int UseCycle = Info[UseSUIdx].Cycle;
+
+    // For each subsequent range.
+    // "Subsequent" here means a larger index in ScarceRanges.
+    for (size_t LaterRangeIdx = CompletedRangeIdx + 1;
+         LaterRangeIdx < ScarceRanges.size(); ++LaterRangeIdx) {
+      const auto &LaterRange = ScarceRanges[LaterRangeIdx];
+
+      // For each Def in the later range's LiveRange.
+      for (const auto &DefInfo : LaterRange.LiveRange.defs()) {
+        MachineOperand *const DefOp = DefInfo.getOperand();
+        assert(DefOp && "DefOp should be valid");
+        MachineInstr *const DefMI = DefOp->getParent();
+        assert(DefMI && "Every operand should have a parent MachineInstr");
+
+        const unsigned DefOpIdx = DefOp->getOperandNo();
+
+        // Find the corresponding SUnit index.
+        int DefSUIdx = -1;
+        for (const int MemberIdx : LaterRange.Members) {
+          if (DAG.SUnits[MemberIdx].getInstr() == DefMI) {
+            DefSUIdx = MemberIdx;
+            break;
+          }
+        }
+        assert(DefSUIdx >= 0 && "Def instruction should be in later range");
+
+        // Compute the anti-dependence latency.
+        const unsigned Latency =
+            SchedModel->computeOperandLatency(UseMI, UseOpIdx, DefMI, DefOpIdx);
+
+        // Update Earliest[Def] = max(Earliest[Def], Cycle[Use] + L).
+        const int NewEarliest = UseCycle + static_cast(Latency);
+        Info[DefSUIdx].Earliest =
+            std::max(Info[DefSUIdx].Earliest, NewEarliest);
+      }
+    }
+  }
+}
+
+/// Fill RangeOfSUnit so that entry i holds the index of the range containing
+/// SUnit i, or -1 when SUnit i is in no range. The vector is resized to
+/// Info.NInstr entries; any previous content is discarded.
+void buildScarceRangeMapping(const std::vector &Ranges,
+                             const ScheduleInfo &Info,
+                             std::vector &RangeOfSUnit) {
+  RangeOfSUnit.assign(Info.NInstr, -1);
+
+  for (size_t RangeIdx = 0; RangeIdx < Ranges.size(); ++RangeIdx) {
+    const auto &Range = Ranges[RangeIdx];
+    for (int MemberIdx : Range.Members) {
+      assert(MemberIdx >= 0 && MemberIdx < Info.NInstr &&
+             "Scarce range member index out of bounds");
+      assert(RangeOfSUnit[MemberIdx] == -1 &&
+             "SUnit cannot belong to multiple scarce ranges");
+      RangeOfSUnit[MemberIdx] = RangeIdx;
+    }
+  }
+}
+
+/// Recompute PredRanges of every range from the direct DAG predecessors of
+/// its members: range P becomes a predecessor of range R when some member of
+/// R has a non-boundary predecessor SUnit that belongs to P (P != R).
+/// Each predecessor range is listed once, in order of first discovery.
+void buildScarceDAG(std::vector &Ranges, const ScheduleInfo &Info,
+                    const ScheduleDAGInstrs &DAG) {
+  // Build the mapping from SUnit to range index.
+  std::vector RangeOfSUnit;
+  buildScarceRangeMapping(Ranges, Info, RangeOfSUnit);
+
+  // Populate PredRanges for each range using direct predecessors from the DAG.
+  for (size_t RangeIdx = 0; RangeIdx < Ranges.size(); ++RangeIdx) {
+    auto &Range = Ranges[RangeIdx];
+    Range.PredRanges.clear();
+
+    // Use a small set to deduplicate predecessor ranges.
+    SmallVector PredSet;
+
+    // For each member of this range.
+    for (int MemberIdx : Range.Members) {
+      assert(MemberIdx >= 0 && MemberIdx < Info.NInstr &&
+             "Scarce range member index out of bounds");
+
+      const auto &SU = DAG.SUnits[MemberIdx];
+
+      // For each direct predecessor of this member.
+      for (const auto &PredEdge : SU.Preds) {
+        const SUnit *PredSU = PredEdge.getSUnit();
+        if (!PredSU || PredSU->isBoundaryNode()) {
+          continue;
+        }
+
+        const int PredIdx = PredSU->NodeNum;
+        // RangeOfSUnit only has Info.NInstr entries. A predecessor outside
+        // that window cannot belong to a scarce range, so skip it rather than
+        // read past the end of the mapping.
+        if (PredIdx < 0 || PredIdx >= Info.NInstr) {
+          continue;
+        }
+        const int PredRange = RangeOfSUnit[PredIdx];
+
+        // If the predecessor is in a different scarce range, record the edge.
+        if (PredRange != -1 && PredRange != static_cast(RangeIdx)) {
+          // Add to PredSet if not already present.
+          if (std::find(PredSet.begin(), PredSet.end(), PredRange) ==
+              PredSet.end()) {
+            PredSet.push_back(PredRange);
+          }
+        }
+      }
+    }
+
+    // Copy deduplicated predecessors to PredRanges.
+    Range.PredRanges = PredSet;
+  }
+}
+
+/// Return true when the PredRanges relation over Ranges contains no cycle.
+/// Repeatedly removes ranges whose remaining indegree is zero and succeeds
+/// when every range could be removed. The indegree is PredRanges.size(), so
+/// PredRanges is expected to hold no duplicates.
+bool checkAcyclic(const std::vector &Ranges) {
+  const size_t K = Ranges.size();
+
+  // Compute indegrees (PredRanges.size() for each range).
+  SmallVector Indegree;
+  Indegree.reserve(K);
+  for (const auto &Range : Ranges) {
+    Indegree.push_back(Range.PredRanges.size());
+  }
+
+  // Kahn's algorithm: process ranges with indegree 0.
+  SmallVector Ready;
+  for (size_t I = 0; I < K; ++I) {
+    if (Indegree[I] == 0) {
+      Ready.push_back(I);
+    }
+  }
+
+  unsigned ProcessedCount = 0;
+  while (!Ready.empty()) {
+    const int Current = Ready.pop_back_val();
+    ++ProcessedCount;
+
+    // For each range that has Current as a predecessor, decrement indegree.
+    for (size_t J = 0; J < K; ++J) {
+      const auto &Range = Ranges[J];
+      if (std::find(Range.PredRanges.begin(), Range.PredRanges.end(),
+                    Current) != Range.PredRanges.end()) {
+        --Indegree[J];
+        if (Indegree[J] == 0) {
+          Ready.push_back(J);
+        }
+      }
+    }
+  }
+
+  // If we processed all ranges, the DAG is acyclic.
+  return ProcessedCount == K;
+}
+
+/// Depth-first enumeration of the orderings of Ranges in which every range
+/// appears after all of its PredRanges. OnOrder is called for each complete
+/// ordering; the search stops and returns true as soon as OnOrder returns
+/// true, and returns false when all orderings were rejected or none exists.
+bool enumerateRangeOrders(
+    const std::vector &Ranges,
+    llvm::function_ref &Order)> OnOrder) {
+
+  const size_t K = Ranges.size();
+
+  // Track which ranges have been placed in the current order.
+  SmallVector Placed(K, false);
+
+  // Current partial order being built.
+  SmallVector Order;
+  Order.reserve(K);
+
+  // Recursive DFS to enumerate linear extensions.
+  // The lambda receives itself as an argument so that it can recurse.
+  const auto Enumerate = [&](auto &EnumerateRef) -> bool {
+    // Base case: complete order found.
+    if (Order.size() == K) {
+      LLVM_DEBUG(dbgs() << "\nEntering burst scheduling with order ";
+                 for (auto Ord : Order) { dbgs() << Ord << ", "; } dbgs()
+                 << "\n";);
+      return OnOrder(Order);
+    }
+
+    // Find ready ranges (all predecessors are in Order).
+    for (size_t RangeIdx = 0; RangeIdx < K; ++RangeIdx) {
+      if (Placed[RangeIdx]) {
+        continue;
+      }
+
+      const auto &Range = Ranges[RangeIdx];
+
+      // Check if all predecessors are placed.
+      const bool AllPredsPlaced = llvm::all_of(
+          Range.PredRanges, [&Placed](int PredIdx) { return Placed[PredIdx]; });
+
+      if (AllPredsPlaced) {
+        // This range is ready; add it to the order and recurse.
+
+        Order.push_back(RangeIdx);
+        Placed[RangeIdx] = true;
+
+        if (EnumerateRef(EnumerateRef)) {
+          return true;
+        }
+
+        // Backtrack.
+        Placed[RangeIdx] = false;
+        Order.pop_back();
+      }
+    }
+
+    return false;
+  };
+
+  LLVM_DEBUG(dbgs() << "Enumerating scarce ranges\n");
+
+  return Enumerate(Enumerate);
+}
+
+} // namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEScarceRegScheduling.h b/llvm/lib/Target/AIE/AIEScarceRegScheduling.h
new file mode 100644
index 000000000000..ec4586495487
--- /dev/null
+++ b/llvm/lib/Target/AIE/AIEScarceRegScheduling.h
@@ -0,0 +1,131 @@
+//===- AIEScarceRegScheduling.h - Scarce Register Scheduling Strategy ----===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+//
+//===----------------------------------------------------------------------===//
+// This file contains a PostPipelinerStrategy that prioritizes scheduling
+// decisions based on scarce register pressure.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AIE_AIESCARCEREGSCHEDULING_H
+#define LLVM_LIB_TARGET_AIE_AIESCARCEREGSCHEDULING_H
+
+#include "AIEPostPipeliner.h"
+#include "llvm/ADT/SmallVector.h"
+#include
+
+namespace llvm {
+class RegLiveRange;
+class RegLiveRangeTracker;
+class SUnit;
+} // namespace llvm
+
+namespace llvm::AIE {
+
+// Strategy that carries a register tracker and the II. Only the constructor
+// and name() are declared here; both data members are marked
+// [[maybe_unused]].
+class ScarceRegScheduling : public PostPipelinerStrategy {
+  // Held by reference: the tracker must outlive this strategy.
+  [[maybe_unused]] RegLiveRangeTracker &RegTracker;
+  [[maybe_unused]] int II;
+
+public:
+  ScarceRegScheduling(ScheduleDAGInstrs &DAG, ScheduleInfo &Info,
+                      RegLiveRangeTracker &RegTracker, int II);
+
+  std::string name() override { return "ScarceRegScheduling"; }
+};
+
+// Represents a scarce range to be scheduled atomically.
+struct ScarceRange {
+  // SUnit indices that are part of this scarce range.
+  SmallVector Members;
+
+  // Indices of scarce ranges that must precede this range (scarce-only DAG).
+  SmallVector PredRanges;
+
+  // Reference to the corresponding RegLiveRange with def/use operand info.
+  // The LiveRange provides the MachineOperand pointers and indices needed for
+  // anti-dependence simulation in BurstMostUrgentStrategy.
+  // Stored as a reference: the RegLiveRange must outlive this ScarceRange.
+  const RegLiveRange &LiveRange;
+
+  // Event-space anchor (start cycle modulo II).
+  int EventAnchor = 0;
+
+  // Issue-space anchor (converted from event-space with base normalization).
+  int IssueAnchor = 0;
+
+  // Event-space length of the MLI.
+  int EventLength = 0;
+
+  // Constructor computes Members from LiveRange's defs and uses via DAG.
+  ScarceRange(const RegLiveRange &LR, const ScheduleDAGInstrs &DAG);
+};
+
+// Strategy for burst scheduling: prioritize predecessors of the current
+// scarce range, then atomically place the scarce range members.
+class BurstMostUrgentStrategy : public PostPipelinerStrategy {
+  // The ordered sequence of scarce ranges to schedule.
+  // Held by reference: the vector must outlive this strategy.
+  const std::vector &ScarceRanges;
+
+  // Precomputed non-scarce predecessors for each range (in original order).
+  std::vector> Predecessors;
+
+  // Members for each range (in original order, copied from ScarceRanges).
+  std::vector> Members;
+
+  // Ordered sets to schedule (built by init() from Predecessors and Members).
+  // For each burst i:
+  //   OrderedMembers[2*i] = Predecessors[RangeOrder[i]]
+  //   OrderedMembers[2*i+1] = Members[RangeOrder[i]]
+  std::vector> OrderedMembers;
+
+  // Current index into OrderedMembers (which set we're working on).
+  size_t CurrentSet = 0;
+
+public:
+  BurstMostUrgentStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info,
+                          const std::vector &ScarceRanges,
+                          int LatestBias);
+
+  // Initialize OrderedMembers based on the given range order.
+  // RangeOrder must have exactly one entry per scarce range.
+  void init(const SmallVector &RangeOrder);
+
+  std::string name() override { return "BurstMostUrgentStrategy"; }
+
+  // Returns true when A should be preferred over B.
+  bool better(const SUnit &A, const SUnit &B) override;
+
+  // Called after a node has been picked; advances to the next set once the
+  // current one is fully scheduled.
+  void selected(const SUnit &N) override;
+
+  bool fromTop() override { return true; }
+
+private:
+  // Simulate anti-dependences from a completed range to all subsequent ranges.
+  void simulateAntiDependences(int CompletedRangeIdx);
+};
+
+// Build a mapping from SUnit index to scarce range index.
+// RangeOfSUnit[i] = range index if SUnit i is in a scarce range, -1 otherwise.
+void buildScarceRangeMapping(const std::vector &Ranges,
+                             const ScheduleInfo &Info,
+                             std::vector &RangeOfSUnit);
+
+// Build the scarce-only DAG by populating PredRanges for each range.
+void buildScarceDAG(std::vector &Ranges, const ScheduleInfo &Info,
+                    const ScheduleDAGInstrs &DAG);
+
+// Check that the scarce-only DAG is acyclic using Kahn's algorithm.
+// Returns true if acyclic, false if a cycle is detected.
+bool checkAcyclic(const std::vector &Ranges);
+
+// Enumerate range orders compatible with the DAG.
+// OnOrder returns true to stop enumeration (success), false to continue.
+// Returns true if OnOrder returned true for any order, false otherwise. +bool enumerateRangeOrders( + const std::vector &Ranges, + llvm::function_ref &Order)> OnOrder); + +} // namespace llvm::AIE + +#endif // LLVM_LIB_TARGET_AIE_AIESCARCEREGSCHEDULING_H diff --git a/llvm/lib/Target/AIE/AIEScheduleInterpreter.cpp b/llvm/lib/Target/AIE/AIEScheduleInterpreter.cpp new file mode 100644 index 000000000000..34143c26e313 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEScheduleInterpreter.cpp @@ -0,0 +1,517 @@ +//===- AIEScheduleInterpreter.cpp - Schedule-aware itinerary interpreter -===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file implements a schedule-aware interpreter that computes register +// file (RF) occupancy windows from scheduled MachineInstrs and itinerary +// data. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AIEScheduleInterpreter.h"
+#include "AIEBaseInstrInfo.h"
+#include "AIELivenessVector.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include
+#include
+#include
+#include
+#include
+
+#define DEBUG_TYPE "aie-schedule-interpreter"
+
+using namespace llvm;
+
+/// Cache the subtarget's instruction info, register info, register-info
+/// object of the function and itinerary data. Non-empty itinerary data is
+/// required (asserted).
+AIEScheduleInterpreter::AIEScheduleInterpreter(const MachineFunction &MF)
+    : TII(*MF.getSubtarget().getInstrInfo()),
+      TRI(*MF.getSubtarget().getRegisterInfo()), MRI(MF.getRegInfo()),
+      Itin(MF.getSubtarget().getInstrItineraryData()) {
+  assert(Itin && !Itin->isEmpty() &&
+         "Instruction itinerary data must be provided");
+}
+
+/// Return the itinerary's operand cycle for operand OpIdx of scheduling class
+/// SchedClass. The itinerary must have an entry for the operand (asserted).
+int AIEScheduleInterpreter::getOperandCycle(unsigned SchedClass,
+                                            unsigned OpIdx) const {
+  // Get operand cycle from itinerary.
+  // This tells us when the operand is accessed relative to instruction issue.
+  const std::optional OperandCycle =
+      Itin->getOperandCycle(SchedClass, OpIdx);
+
+  // Ensure we have timing information for this operand.
+  assert(OperandCycle.has_value() &&
+         "Itinerary must provide operand cycle information for all operands");
+
+  return *OperandCycle;
+}
+
+// Helper to add an event to the schedule, resizing if necessary.
+// Cycle is used directly as an index into Schedule and must be non-negative;
+// Schedule grows to Cycle + 1 entries when it is too short.
+static void addEvent(EventSchedule &Schedule, int Cycle, EventType Type,
+                     unsigned VReg, unsigned SubRegIdx,
+                     unsigned ForwardingClass, const MachineInstr *MI,
+                     unsigned OpIdx) {
+  // A negative cycle would skip the resize below and index out of bounds.
+  assert(Cycle >= 0 && "Event cycle must be non-negative");
+
+  // Ensure the schedule is large enough
+  if (Cycle >= static_cast<int>(Schedule.size())) {
+    Schedule.resize(Cycle + 1);
+  }
+
+  // Add the event
+  Schedule[Cycle].emplace_back(Type, VReg, SubRegIdx, ForwardingClass, MI,
+                               OpIdx);
+}
+
+void AIEScheduleInterpreter::addInstructionEvents(
+    const MachineInstr &MI, int IssueCycle, EventSchedule &Schedule) const {
+
+  LLVM_DEBUG(dbgs() << "Adding events for instruction at cycle " << IssueCycle
+                    << ": " << MI);
+
+  // Get scheduling class once for all operands.
+  const MCInstrDesc &Desc = MI.getDesc();
+  const unsigned SchedClass = Desc.getSchedClass();
+
+  // Process all operands
+  for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+
+    // Skip non-register operands
+    if (!MO.isReg() || !MO.getReg())
+      continue;
+
+    // Skip physical registers for now
+    if (!Register::isVirtualRegister(MO.getReg()))
+      continue;
+
+    // Skip implicit operands
+    if (MO.isImplicit())
+      continue;
+
+    const Register VReg = MO.getReg();
+    const unsigned SubRegIdx = MO.getSubReg();
+    const unsigned ForwardingClass =
+        Itin->getForwardingClass(SchedClass, OpIdx);
+
+    if (MO.isUse()) {
+      const int ReadCycleOffset = getOperandCycle(SchedClass, OpIdx);
+      const int ReadCycle = IssueCycle + ReadCycleOffset;
+
+      // Add read event.
+      // ForwardingClass != 0 indicates this read also accesses a bypass
+      // one cycle earlier.
+      // NOTE(review): dumpEventSchedule in this file places the bypass access
+      // of a read in the same cycle and that of a write one cycle earlier --
+      // confirm which description is the intended one.
+ addEvent(Schedule, ReadCycle, EventType::Read, VReg, SubRegIdx, + ForwardingClass, &MI, OpIdx); + + LLVM_DEBUG(dbgs() << " Read %vreg" << Register::virtReg2Index(VReg); + if (SubRegIdx) dbgs() + << ":" << TRI.getSubRegIndexName(SubRegIdx); + dbgs() << " at cycle " << ReadCycle; + if (ForwardingClass) dbgs() + << " (forwarding class " << ForwardingClass << ")"; + dbgs() << "\n"); + } + + if (MO.isDef()) { + const int WriteCycleOffset = getOperandCycle(SchedClass, OpIdx); + const int WriteCycle = IssueCycle + WriteCycleOffset; + + // Add write event. + // ForwardingClass != 0 indicates this write also writes to a bypass + // at the same cycle. + addEvent(Schedule, WriteCycle, EventType::Write, VReg, SubRegIdx, + ForwardingClass, &MI, OpIdx); + + LLVM_DEBUG(dbgs() << " Write %vreg" << Register::virtReg2Index(VReg); + if (SubRegIdx) dbgs() + << ":" << TRI.getSubRegIndexName(SubRegIdx); + dbgs() << " at cycle " << WriteCycle; + if (ForwardingClass) dbgs() + << " (forwarding class " << ForwardingClass << ")"; + dbgs() << "\n"); + } + } +} + +void AIEScheduleInterpreter::dumpEventSchedule(const EventSchedule &Schedule, + raw_ostream &OS) const { + + // Collect all unique virtual registers + std::set AllVRegs; + for (const auto &CycleEvents : Schedule) { + for (const auto &Event : CycleEvents) { + AllVRegs.insert(Event.VReg); + } + } + + // Helper lambda to format an event as a string + auto FormatEvent = [](const RFEvent &Event) -> std::string { + const char Action = (Event.Type == EventType::Read) ? 'R' : 'W'; + std::string ActionStr; + if (Event.SubRegIdx != 0) { + // Include subreg info if present (format as R## or W##) + raw_string_ostream Stream(ActionStr); + Stream << format("%c%02d", Action, Event.SubRegIdx); + } else { + // No subreg, just the action with padding + ActionStr = Action; + ActionStr += " "; + } + return ActionStr; + }; + + // Build separate maps for register and bypass events per VReg. 
+ // Bypass events are derived from ForwardingClass: + // - Reads with ForwardingClass != 0 also read bypass at same cycle + // - Writes with ForwardingClass != 0 also write bypass one cycle earlier + std::map> RegEventsByVReg; + std::map> BypassEventsByVReg; + for (unsigned Cycle = 0; Cycle < Schedule.size(); ++Cycle) { + const auto &CycleEvents = Schedule[Cycle]; + for (const auto &Event : CycleEvents) { + // Add space if there's already an event in this cycle + if (!RegEventsByVReg[Event.VReg][Cycle].empty()) { + RegEventsByVReg[Event.VReg][Cycle] += " "; + } + RegEventsByVReg[Event.VReg][Cycle] += FormatEvent(Event); + + // If this event uses a bypass, add bypass event + if (Event.ForwardingClass != 0) { + const int BypassCycle = + (Event.Type == EventType::Write) ? Cycle - 1 : Cycle; + if (BypassCycle >= 0) { + if (!BypassEventsByVReg[Event.VReg][BypassCycle].empty()) { + BypassEventsByVReg[Event.VReg][BypassCycle] += " "; + } + BypassEventsByVReg[Event.VReg][BypassCycle] += FormatEvent(Event); + } + } + } + } + + // Print header with cycle numbers. + // Reserve 12 characters for register class names to handle long names. + OS << " RegClass VReg |"; + for (unsigned Cycle = 0; Cycle < Schedule.size(); ++Cycle) { + OS << format(" %4d |", Cycle); + } + OS << "\n"; + + // Print separator. + OS << "-------------------+"; + for (unsigned Cycle = 0; Cycle < Schedule.size(); ++Cycle) { + OS << "------+"; + } + OS << "\n"; + + // Helper lambda to print a row of events + auto PrintEventRow = [&](const std::map &Events) { + for (unsigned Cycle = 0; Cycle < Schedule.size(); ++Cycle) { + auto It = Events.find(Cycle); + OS << format(" %-4s |", It != Events.end() ? It->second.c_str() : ""); + } + OS << "\n"; + }; + + // Print each VReg with register events and bypass events on separate lines. + for (unsigned VReg : AllVRegs) { + const auto Reg = Register::virtReg2Index(VReg); + const char *RCName = TRI.getRegClassName(MRI.getRegClass(VReg)); + + // Print register events. 
+ // Use %-12.12s to left-align, pad to 12 chars, and truncate at 12 chars. + OS << format(" %-12.12s%5d |", RCName, Reg); + PrintEventRow(RegEventsByVReg[VReg]); + + // Print bypass events if any exist for this VReg. + const auto &BypassEvents = BypassEventsByVReg[VReg]; + if (!BypassEvents.empty()) { + OS << " bypass |"; + PrintEventRow(BypassEvents); + } + } +} + +// Helper function to get lane mask for a register operand +static LaneBitmask getLaneMaskFor(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + unsigned SubRegIdx, unsigned VReg) { + if (SubRegIdx == 0) { + // Full/composite register - get the actual lane mask from register class + const TargetRegisterClass *RC = MRI.getRegClass(VReg); + return RC->getLaneMask(); + } + // Specific subregister + return TRI.getSubRegIndexLaneMask(SubRegIdx); +} + +DenseMap +AIEScheduleInterpreter::buildLiveLanes(const EventSchedule &Schedule, + int II) const { + + assert(II > 0 && "Initiation interval must be positive"); + + DenseMap LiveLanesByVirtReg; + + if (Schedule.empty()) + return LiveLanesByVirtReg; + + // State: tracks which lanes are currently live when scanning backward + DenseMap ActiveMask; + + // Process cycles backward + int MaxCycle = Schedule.size() - 1; + for (int C = MaxCycle; C >= 0; --C) { + const auto &Events = Schedule[C]; + int ModuloCycle = C % II; // Master modulo-II bit + + // First, record what's live ENTERING this cycle (before any events) + // This is what was active from processing later cycles + for (const auto &[VReg, Mask] : ActiveMask) { + if (Mask.any()) { + // Ensure the output vector is sized for this VReg + if (!LiveLanesByVirtReg.count(VReg)) { + LiveLanesByVirtReg[VReg] = AIE::LivenessVector(II); + } + LiveLanesByVirtReg[VReg][ModuloCycle] |= Mask; + + LLVM_DEBUG(dbgs() << " Lanes " << PrintLaneMask(Mask) << " for %vreg" + << Register::virtReg2Index(VReg) + << " live entering cycle " << C << " (offset " + << ModuloCycle << ")\n"); + } + } + + // Collect reads for 
this cycle (they don't make register live in this + // cycle) + DenseMap ReadsInCycle; + + // Step 1: Process defs (writes) - they occupy the register and kill lanes + // going backward + for (const auto &Event : Events) { + if (Event.Type == EventType::Write) { + LaneBitmask M = getLaneMaskFor(TRI, MRI, Event.SubRegIdx, Event.VReg); + + // Ensure the output vector exists for this VReg + if (!LiveLanesByVirtReg.count(Event.VReg)) { + LiveLanesByVirtReg[Event.VReg] = AIE::LivenessVector(II); + } + + // RF write occupies register file at ModuloCycle + LiveLanesByVirtReg[Event.VReg][ModuloCycle] |= M; + + // If this write uses a bypass, mark bypass write one cycle earlier + if (Event.ForwardingClass != 0) { + const int BypassWriteCycle = C - 1; + if (BypassWriteCycle >= 0) { + const int BypassModuloCycle = BypassWriteCycle % II; + LiveLanesByVirtReg[Event.VReg][BypassModuloCycle].addBypassWrite( + Event.ForwardingClass); + + LLVM_DEBUG(dbgs() + << " Bypass write of class " << Event.ForwardingClass + << " at cycle " << BypassWriteCycle << " (offset " + << BypassModuloCycle << ")\n"); + } + } + + // Kill those lanes going backward + ActiveMask[Event.VReg] &= ~M; + + LLVM_DEBUG(dbgs() << " Cycle " << C << " (" << ModuloCycle + << "): Write %vreg" + << Register::virtReg2Index(Event.VReg); + if (Event.SubRegIdx) dbgs() + << ":" << TRI.getSubRegIndexName(Event.SubRegIdx); + dbgs() << " occupies lanes " << PrintLaneMask(M) + << " and kills them going backward\n"); + + // If no lanes remain active, remove from map + if (ActiveMask[Event.VReg].none()) { + ActiveMask.erase(Event.VReg); + } + } + } + + // Step 2: Collect all reads in this cycle + for (const auto &Event : Events) { + if (Event.Type == EventType::Read) { + LaneBitmask M = getLaneMaskFor(TRI, MRI, Event.SubRegIdx, Event.VReg); + + // Accumulate reads for this VReg in this cycle + ReadsInCycle[Event.VReg] |= M; + + LLVM_DEBUG(dbgs() << " Cycle " << C << " (" << ModuloCycle + << "): Read %vreg" + << 
Register::virtReg2Index(Event.VReg); + if (Event.SubRegIdx) dbgs() + << ":" << TRI.getSubRegIndexName(Event.SubRegIdx); + dbgs() << " lanes " << PrintLaneMask(M) << "\n"); + + // If this read uses a bypass, mark bypass read at same cycle + if (Event.ForwardingClass != 0) { + if (!LiveLanesByVirtReg.count(Event.VReg)) { + LiveLanesByVirtReg[Event.VReg] = AIE::LivenessVector(II); + } + LiveLanesByVirtReg[Event.VReg][ModuloCycle].addBypassRead( + Event.ForwardingClass); + + LLVM_DEBUG(dbgs() << " Bypass read of class " + << Event.ForwardingClass << " at cycle " << C + << " (offset " << ModuloCycle << ")\n"); + } + } + } + + // Step 3: Now propagate reads to ActiveMask for previous cycles + // Reads don't make the register live in the current cycle + for (const auto &[VReg, Mask] : ReadsInCycle) { + // The reads make the register live going backward (but not in this cycle) + ActiveMask[VReg] |= Mask; + + LLVM_DEBUG(dbgs() << " %vreg" << Register::virtReg2Index(VReg) + << " lanes " << PrintLaneMask(Mask) + << " become live going backward from cycle " << C + << "\n"); + } + } + + // At the end, ActiveMask should be empty (all defs should have been seen) + // If not, we have uses without defs (which would be an error in def-first + // semantics) + for (const auto &[VReg, Mask] : ActiveMask) { + if (Mask.any()) { + LLVM_DEBUG(dbgs() << "Warning: %vreg" << Register::virtReg2Index(VReg) + << " has lanes " << PrintLaneMask(Mask) + << " live at beginning (use without def?)\n"); + } + } + + return LiveLanesByVirtReg; +} + +void AIEScheduleInterpreter::dumpLiveLanes( + const DenseMap &LiveLanesByVirtReg, int II, + raw_ostream &OS) const { + + if (LiveLanesByVirtReg.empty()) { + OS << "No live lanes data\n"; + return; + } + + // Collect and sort VRegs for consistent output. 
+ SmallVector VRegs; + for (const auto &[VReg, _] : LiveLanesByVirtReg) { + VRegs.push_back(VReg); + } + llvm::sort(VRegs); + + OS << "Live Lanes (II=" << II << "):\n"; + OS << "VReg | "; + for (int T = 0; T < II; ++T) { + OS << format("t%-6d ", T); + } + OS << "\n"; + + OS << "-------+"; + for (int T = 0; T < II; ++T) { + OS << "--------"; + } + OS << "\n"; + + for (unsigned VReg : VRegs) { + OS << format("%-6d | ", Register::virtReg2Index(VReg)); + + const auto &LanesByOffset = LiveLanesByVirtReg.lookup(VReg); + for (int T = 0; T < II; ++T) { + const AIE::Liveness &L = LanesByOffset[T]; + if (L.any()) { + // Build indicator showing lanes and bypass classes. + // Format examples: + // "## " = lanes only + // "#R1 " = lanes + bypass read class 1 + // "#W2 " = lanes + bypass write class 2 + // "R1W2 " = bypass read class 1 + bypass write class 2 + // "#R1W2 " = lanes + bypass read class 1 + bypass write class 2 + std::string Indicator; + if (L.getLanes().any()) { + Indicator = "#"; + } + + // Add bypass read classes. + if (!L.getBypassReads().empty()) { + Indicator += "R"; + for (unsigned FC : L.getBypassReads()) { + Indicator += std::to_string(FC); + } + } + + // Add bypass write classes. + if (!L.getBypassWrites().empty()) { + Indicator += "W"; + for (unsigned FC : L.getBypassWrites()) { + Indicator += std::to_string(FC); + } + } + + // Pad to 6 characters for alignment. + while (Indicator.size() < 6) { + Indicator += " "; + } + OS << " " << Indicator << " "; + } else { + OS << " .. "; + } + } + OS << "\n"; + } +} + +BitVector +AIEScheduleInterpreter::buildSubRegBitmap(ArrayRef LaneByOffset, + unsigned SubRegIdx) const { + + int II = LaneByOffset.size(); + BitVector BV(II, false); + + LaneBitmask SubRegMask = (SubRegIdx == 0) + ? 
LaneBitmask::getAll() + : TRI.getSubRegIndexLaneMask(SubRegIdx); + + for (int T = 0; T < II; ++T) { + BV[T] = (LaneByOffset[T] & SubRegMask).any(); + } + + return BV; +} + +BitVector AIEScheduleInterpreter::buildVRegBitmap( + ArrayRef LaneByOffset) const { + + int II = LaneByOffset.size(); + BitVector BV(II, false); + + for (int T = 0; T < II; ++T) { + BV[T] = LaneByOffset[T].any(); + } + + return BV; +} diff --git a/llvm/lib/Target/AIE/AIEScheduleInterpreter.h b/llvm/lib/Target/AIE/AIEScheduleInterpreter.h new file mode 100644 index 000000000000..b9932a671287 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEScheduleInterpreter.h @@ -0,0 +1,169 @@ +//===- AIEScheduleInterpreter.h - Schedule-aware itinerary interpreter ---===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file contains a schedule-aware interpreter that computes register +// file (RF) occupancy windows from scheduled MachineInstrs and itinerary +// data. It emits per-operand, per-subregister liveness segments via a +// callback interface, enabling cycle-accurate interference computation. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIESCHEDULEINTERPRETER_H +#define LLVM_LIB_TARGET_AIE_AIESCHEDULEINTERPRETER_H + +#include "AIELivenessVector.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include +#include + +namespace llvm { + +class MachineBasicBlock; +class MachineFunction; +class MachineInstr; +class MachineRegisterInfo; +class TargetInstrInfo; +class TargetRegisterInfo; +class InstrItineraryData; +class ScheduleDAGInstrs; +class SUnit; + +/// Key identifying a live range and its subregister +struct LRKey { + unsigned LRId; // Live range identifier + unsigned SubRegIdx; // Subregister index (0 for full register) + + bool operator==(const LRKey &Other) const { + return LRId == Other.LRId && SubRegIdx == Other.SubRegIdx; + } +}; + +/// Callback interface for receiving live range events +class LiveRangeEventSink { +public: + /// Called when a live range segment starts at a specific cycle + virtual void startLiveRange(const LRKey &Key, int Cycle) = 0; + + /// Called when a live range segment ends at a specific cycle + virtual void endLiveRange(const LRKey &Key, int Cycle) = 0; + + virtual ~LiveRangeEventSink() = default; +}; + +/// Map from MachineInstr to its scheduled cycle +using CycleMap = DenseMap; + +/// Handle for a live range +struct LRHandle { + unsigned LRId; // Live range identifier + unsigned VReg = 0; // Virtual register (optional, for diagnostics) + const TargetRegisterClass *RC = nullptr; // Register class (optional) +}; + +/// Event types for register file access +enum class EventType { Read, Write }; + +/// Event structure to track register accesses +struct RFEvent { + EventType Type; // Read or Write + unsigned VReg; // Virtual register + unsigned SubRegIdx; // Subregister index (0 for 
full register) + unsigned ForwardingClass; // Forwarding/bypass class (0 = no bypass) + const MachineInstr *MI; // Source instruction + unsigned OpIdx; // Operand index + + RFEvent(EventType T, unsigned V, unsigned S, unsigned F, + const MachineInstr *M, unsigned O) + : Type(T), VReg(V), SubRegIdx(S), ForwardingClass(F), MI(M), OpIdx(O) {} +}; + +/// Event schedule indexed by cycle +using EventSchedule = std::vector>; + +/// Schedule interpreter that computes RF occupancy windows +class AIEScheduleInterpreter { + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + const InstrItineraryData *Itin; + + /// Get the cycle offset when an operand is accessed given a scheduling class + /// Returns the offset from issue cycle + int getOperandCycle(unsigned SchedClass, unsigned OpIdx) const; + +public: + explicit AIEScheduleInterpreter(const MachineFunction &MF); + + /// Add events for a single instruction to the event schedule + /// + /// Processes the explicit virtual-register operands of the instruction and + /// adds their read/write events to the schedule based on the issue cycle + /// and itinerary timing information. + /// + /// \param MI The machine instruction to process + /// \param IssueCycle The cycle when the instruction is issued + /// \param Schedule The event schedule to update (will be resized if needed) + void addInstructionEvents(const MachineInstr &MI, int IssueCycle, + EventSchedule &Schedule) const; + + /// Dump the event schedule in a tabular format + /// + /// Displays virtual registers in rows and cycles in aligned columns, + /// showing 'R' for reads and 'W' for writes. 
+ /// + /// \param Schedule The event schedule to dump + /// \param OS Output stream to write to + void dumpEventSchedule(const EventSchedule &Schedule, raw_ostream &OS) const; + + /// Build per-lane modulo-II live range masks from an event schedule + /// + /// Uses a backward scan to compute which lanes of each virtual register + /// are live at each modulo-II offset. The result is a map from VReg to + /// an AIE::LivenessVector, where LiveLanesByVirtReg[VReg][t] indicates + /// which lanes are live at offset t (0 <= t < II). + /// + /// \param Schedule The event schedule to analyze + /// \param II The initiation interval for modulo scheduling + /// \return Map of VReg to per-offset lane masks + DenseMap + buildLiveLanes(const EventSchedule &Schedule, int II) const; + + /// Dump the live lanes in a readable format + /// + /// \param LiveLanesByVirtReg The live lanes data to dump + /// \param II The initiation interval + /// \param OS Output stream to write to + void dumpLiveLanes( + const DenseMap &LiveLanesByVirtReg, int II, + raw_ostream &OS) const; + + /// Convert lane masks to a BitVector for a specific subregister + /// + /// \param LaneByOffset Array of lane masks indexed by modulo-II offset + /// \param SubRegIdx The subregister index (0 for full register) + /// \return BitVector of length II with bits set where the subregister is live + BitVector buildSubRegBitmap(ArrayRef LaneByOffset, + unsigned SubRegIdx) const; + + /// Convert lane masks to a BitVector for the full register + /// + /// \param LaneByOffset Array of lane masks indexed by modulo-II offset + /// \return BitVector of length II with bits set where any lane is live + BitVector buildVRegBitmap(ArrayRef LaneByOffset) const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AIE_AIESCHEDULEINTERPRETER_H diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index b0d4fd607b34..d333902f76e7 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ 
b/llvm/lib/Target/AIE/CMakeLists.txt @@ -128,7 +128,9 @@ add_llvm_target(AIECodeGen AIEPreISelCombiner.cpp AIEInterBlockScheduling.cpp AIEISelDAGToDAG.cpp + AIELivenessVector.cpp AIELegalizerHelper.cpp + AIELiveRangeUtils.cpp AIELiveRegs.cpp AIELoopClass.cpp AIEMachineAlignment.cpp @@ -139,11 +141,15 @@ add_llvm_target(AIECodeGen AIEMIRFormatter.cpp AIEMultiSlotInstrMaterializer.cpp AIEPostPipeliner.cpp + AIEPostRegAlloc.cpp AIEPostSelectOptimize.cpp AIEPseudoBranchExpansion.cpp AIEPtrModOptimizer.cpp AIERegClassConstrainer.cpp + AIERegDefUseTracker.cpp AIERegMemEventTracker.cpp + AIEScarceRegScheduling.cpp + AIEScheduleInterpreter.cpp AIESlotCounts.cpp AIESpillSlotOptimization.cpp AIESlotStatistics.cpp diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v2.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v2.mir index d292def1eb9a..4a973749629e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v2.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v2.mir @@ -1,4 +1,3 @@ -# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -17,56 +16,51 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p7, #64]; vldb x2, [p6, #64]; nops ; nopx ; mov p5, p6; nopv - ; CHECK-NEXT: padda [p5], m4; vldb.3d x0, [p6], d0; nopx ; mov p3, p7; nops - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb x6, [p5, #0] + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p7, #64]; vldb x0, [p6, #64]; nops ; nopx ; mov p5, p6; nopv + ; CHECK-NEXT: padda [p5], m4; vldb.3d x6, [p6], d0; mov p3, p7 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb x2, [p5, #0] ; CHECK-NEXT: vldb x4, [p5, #64] ; CHECK-NEXT: paddb [p3], m5 - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: vshuffle x8, x0, x2, r0 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4; vshuffle x9, x0, x2, r1 - ; CHECK-NEXT: vshuffle x0, x4, x6, r0 - ; CHECK-NEXT: vshuffle x1, x4, x6, r1; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3, #0]; vmul.f dm4, y0, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3, #0] ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; add.nc lc, r0, #-1 ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: vshuffle x8, x6, x0, r0 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x6, x0, r1 + ; CHECK-NEXT: vshuffle x0, x4, x2, r0 + ; CHECK-NEXT: vshuffle x1, x4, x2, r1; vmul.f dm4, y4, y5, r2 ; CHECK-NEXT: nopa ; nopb ; nops ; movxm le, #.L_LEnd0; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p7, #64]; vldb x2, [p6, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; mov p5, p6; nopv - ; CHECK-NEXT: padda [p5], m4; vldb.3d x0, [p6], d0; nops ; nopx ; mov p3, p7; nopv - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb x6, [p5, #0]; nopx ; vconv.bfp16ebs8.fp32 ex5, dm4 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p7, 
#64]; vldb x0, [p6, #64]; vconv.bfp16ebs8.fp32 ex5, dm4; nopx ; mov p5, p6; vmul.f dm4, y0, y5, r2 + ; CHECK-NEXT: padda [p5], m4; vldb.3d x6, [p6], d0; nops ; nopx ; mov p3, p7; nopv + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb x2, [p5, #0]; nopx ; CHECK-NEXT: vldb x4, [p5, #64] - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex7, dm4; paddb [p3], m5; vmac.f dm3, dm3, ex2, ex3, r3 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vmac.f dm2, dm2, ex2, ex5, r3 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vshuffle x8, x0, x2, r0; vmac.f dm1, dm1, ex7, ex3, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4; vshuffle x9, x0, x2, r1; vmac.f dm0, dm0, ex7, ex5, r3 - ; CHECK-NEXT: vshuffle x0, x4, x6, r0 - ; CHECK-NEXT: vshuffle x1, x4, x6, r1; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3, #0]; vmul.f dm4, y0, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64] + ; CHECK-NEXT: paddb [p3], m5; vconv.bfp16ebs8.fp32 ex7, dm4 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3, #0] + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; vconv.bfp16ebs8.fp32 ex7, dm4 ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x8, x6, x0, r0; vmac.f dm3, dm3, ex3, ex7, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x6, x0, r1; vmac.f dm1, dm1, ex5, ex7, r3 + ; CHECK-NEXT: vshuffle x0, x4, x2, r0; vmac.f dm2, dm2, ex3, ex7, r3 + ; CHECK-NEXT: vshuffle x1, x4, x2, r1; vmul.f dm4, y4, y5, r2 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vmac.f dm0, dm0, ex5, ex7, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: nopa ; nopb ; vconv.bfp16ebs8.fp32 ex3, dm4; nopxm ; nopv - ; CHECK-NEXT: nopx - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex5, dm4 + ; CHECK-NEXT: nopa ; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex5, dm4; vmul.f dm4, y0, y5, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex7, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: 
vconv.bfp16ebs8.fp32 ex7, dm4; vmac.f dm3, dm3, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex7, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: vmac.f dm2, dm2, ex2, ex5, r3 + ; CHECK-NEXT: vmac.f dm3, dm3, ex3, ex7, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex5, ex7, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex3, ex7, r3 ; CHECK-NEXT: nop - ; CHECK-NEXT: vmac.f dm1, dm1, ex7, ex3, r3 - ; CHECK-NEXT: vmac.f dm0, dm0, ex7, ex5, r3 + ; CHECK-NEXT: vmac.f dm0, dm0, ex5, ex7, r3 ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16.mir index 77c15549f7b5..094725094bdc 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16.mir @@ -17,57 +17,53 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; vldb.3d x2, [p6], d0; nops ; nopx ; mov p5, p6; nopv - ; CHECK-NEXT: padda [p5], m4; vldb x4, [p5, #64]; mov p3, p7 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb x6, [p5], #64 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; vldb x8, [p5, #0] + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb.3d x2, [p6], d0; movs p5, p6; nopx ; mov p3, p7; nopv + ; CHECK-NEXT: padda [p5], m4; vldb x11, [p5, #64]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; vldb x6, [p5], #64 + ; CHECK-NEXT: vldb x8, [p5, #0] ; CHECK-NEXT: paddb [p3], m5 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3], #64 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #0] ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x4, x2, x11, r0 + ; CHECK-NEXT: vshuffle x5, x2, x11, r1 + ; CHECK-NEXT: vshuffle x10, x6, x8, r0 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex6, dm4; vshuffle x11, x6, x8, r1; vmul.f dm4, y2, y0, r2 ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: vshuffle x10, x2, 
x4, r0 - ; CHECK-NEXT: vshuffle x11, x2, x4, r1 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4; vshuffle x4, x6, x8, r0 - ; CHECK-NEXT: vshuffle x5, x6, x8, r1; vmul.f dm4, y5, y0, r2 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3], #64; vmul.f dm4, y2, y0, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #0] - ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex4, dm4; add.nc lc, r0, #-1; vmul.f dm4, y5, y0, r2 ; CHECK-NEXT: movxm ls, #.LBB0_1 - ; CHECK-NEXT: nopa ; nopb ; vconv.bfp16ebs8.fp32 ex6, dm4; movxm le, #.L_LEnd0; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; movxm le, #.L_LEnd0; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb.3d x2, [p6], d0; mov p5, p6 - ; CHECK-NEXT: padda [p5], m4; vldb x4, [p5, #64]; vconv.bfp16ebs8.fp32 ex4, dm4; mov p3, p7 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb x6, [p5], #64 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; vldb x8, [p5, #0]; vconv.bfp16ebs8.fp32 ex8, dm4; vmac.f dm3, dm3, ex2, ex6, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vldb.3d x2, [p6], d0; nopx ; mov p3, p7; movs p5, p6 + ; CHECK-NEXT: padda [p5], m4; vldb x11, [p5, #64]; vconv.bfp16ebs8.fp32 ex8, dm4 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; vldb x6, [p5], #64 + ; CHECK-NEXT: vldb x8, [p5, #0]; vconv.bfp16ebs8.fp32 ex8, dm4 ; CHECK-NEXT: paddb [p3], m5 - ; CHECK-NEXT: vmac.f dm2, dm2, ex2, ex4, r3 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex4, r3 - ; CHECK-NEXT: vshuffle x10, x2, x4, r0; vmac.f dm1, dm1, ex8, ex6, r3 - ; CHECK-NEXT: vshuffle x11, x2, x4, r1 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4; vshuffle x4, x6, x8, r0 - ; CHECK-NEXT: vshuffle x5, x6, x8, r1; vmul.f dm4, y5, y0, r2 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3], #64; vmul.f dm4, y2, y0, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #0] + ; CHECK-NEXT: vlda.conv.fp32.bf16 
cml4, [p3], #64; vmac.f dm3, dm3, ex6, ex8, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #0]; vmac.f dm1, dm1, ex4, ex8, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex6, ex8, r3 + ; CHECK-NEXT: vshuffle x4, x2, x11, r0; vmac.f dm0, dm0, ex4, ex8, r3 + ; CHECK-NEXT: vshuffle x5, x2, x11, r1 + ; CHECK-NEXT: vshuffle x10, x6, x8, r0 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex6, dm4; vshuffle x11, x6, x8, r1; vmul.f dm4, y2, y0, r2 ; CHECK-NEXT: nop + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex4, dm4; vmul.f dm4, y5, y0, r2 ; CHECK-NEXT: nop ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; nopb ; vconv.bfp16ebs8.fp32 ex6, dm4; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: nopx - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex4, dm4 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex8, dm4; vmac.f dm3, dm3, ex2, ex6, r3 + ; CHECK-NEXT: nopa ; nopx + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex8, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: vmac.f dm2, dm2, ex2, ex4, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex8, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex4, r3 - ; CHECK-NEXT: vmac.f dm1, dm1, ex8, ex6, r3 + ; CHECK-NEXT: vmac.f dm3, dm3, ex6, ex8, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex4, ex8, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex6, ex8, r3 + ; CHECK-NEXT: vmac.f dm0, dm0, ex4, ex8, r3 ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 @@ -123,12 +119,12 @@ body: | successors: %bb.2, %bb.3 liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y0:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 - $p5 = MOV_alu_mv_mv_mv_scl $p6 + $p5 = MOV_scalar_pseudo $p6 $x2, $p6, $dc0, $dc4 = 
VLD_3D_x_pseudo killed $p6, $d0_3d :: (load (<16 x s32>)) renamable $x4 = VLD_x_idx_imm_pseudo renamable $p5, 64 :: (load (<16 x s32>)) - $p3 = MOV_alu_mv_mv_mv_scl $p7 - $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf killed $p7, $d1_3d :: (load (<32 x s16>)) + $p3 = MOV_scalar_pseudo $p7 renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm renamable $p3, 64 :: (load (<32 x s16>)) + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf killed $p7, $d1_3d :: (load (<32 x s16>)) renamable $p5 = PADD_mod_pseudo killed renamable $p5, renamable $m4 renamable $x6, renamable $p5 = VLD_x_pstm_nrm_imm_pseudo killed renamable $p5, 64 :: (load (<16 x s32>)) renamable $x8 = VLD_x_idx_imm_pseudo killed renamable $p5, 0 :: (load (<16 x s32>)) diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/postpipeliner-ore.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/postpipeliner-ore.mir index 29c9a6f155a3..2ef9cea8f838 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/postpipeliner-ore.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/postpipeliner-ore.mir @@ -1,8 +1,9 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ # RUN: --start-before=postmisched %s \ @@ -13,16 +14,6 @@ --- | define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { - ; CHECK: --- !Passed - ; CHECK-NEXT: Pass: postpipeliner - ; CHECK-NEXT: Name: schedule - ; CHECK-NEXT: Function: gemm - ; CHECK-NEXT: Args: - ; CHECK-NEXT: - String: Schedule found - ; CHECK-NEXT: - NS: '4' - ; CHECK-NEXT: - II: '8' - ; CHECK-NEXT: - BasicBlock: for.body - ; CHECK-NEXT: ... entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup @@ -49,13 +40,6 @@ define dso_local void @gemm_lowitercount(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { - ; CHECK: --- !Missed - ; CHECK-NEXT: Pass: postpipeliner - ; CHECK-NEXT: Name: schedule - ; CHECK-NEXT: Function: gemm_lowitercount - ; CHECK-NEXT: Args: - ; CHECK-NEXT: - String: No schedule found. - ; CHECK-NEXT: ... entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup @@ -192,3 +176,5 @@ body: | DelayedSchedBarrier ... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-exp.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-exp.mir new file mode 100644 index 000000000000..626a3fb8460f --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-exp.mir @@ -0,0 +1,250 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates + +# This test exercises experimental modules AIERegDefUseTracker and AIEScheduleInterpreter +# using the motivating GEMM example with multi-slot pseudo materialization + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched --stop-after=postmisched \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: -o - --debug-only=aie-reg-liverange %s 2>&1 | FileCheck %s +# REQUIRES: asserts + +# CHECK: FINAL LIVE RANGES +# CHECK: ================================ +# CHECK: Total live ranges: 14 +# CHECK: Live Range #28 for dm4: +# CHECK: Definitions (2): +# CHECK: [0] Register: cmh4 (SubRegIdx: 9) renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm renamable $p3, 64 :: (load (<32 x s16>)) +# CHECK: [1] Register: cml4 (SubRegIdx: 10) $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf killed $p7(tied-def 1), $d1_3d :: (load (<32 x s16>)) +# CHECK: Uses (1): +# CHECK: [0] Register: dm4 renamable $ex2 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Live Range #22 for dm4: +# CHECK: Definitions (1): +# CHECK: [0] Register: dm4 renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y5, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Uses (1): +# CHECK: [0] Register: dm4 renamable $ex6 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Live Range #16 for dm4: +# CHECK: Definitions (1): +# CHECK: [0] Register: dm4 renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y2, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Uses (1): +# CHECK: [0] Register: dm4 renamable $ex4 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Live Range #13 for dm4: +# CHECK: 
Definitions (2): +# CHECK: [0] Register: cmh4 (SubRegIdx: 9) renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm killed renamable $p3, 0 :: (load (<32 x s16>)) +# CHECK: [1] Register: cml4 (SubRegIdx: 10) renamable $cml4, renamable $p3 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_pstm_nrm_imm killed renamable $p3(tied-def 1), 64 :: (load (<32 x s16>)) +# CHECK: Uses (1): +# CHECK: [0] Register: dm4 renamable $ex8 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Live Range #10 for ex2: +# CHECK: Definitions (1): +# CHECK: [0] Register: ex2 renamable $ex2 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Uses (2): +# CHECK: [0] Register: ex2 renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex2, renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: [1] Register: ex2 renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, renamable $ex2, renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Live Range #3 for ex4: +# CHECK: Definitions (1): +# CHECK: [0] Register: ex4 renamable $ex4 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Uses (2): +# CHECK: [0] Register: ex4 renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, killed renamable $ex8, killed renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: [1] Register: ex4 renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex2, renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Live Range #7 for ex6: +# CHECK: Definitions (1): +# CHECK: [0] Register: ex6 renamable $ex6 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, 
implicit $crf2bmask, implicit $crrnd +# CHECK: Uses (2): +# CHECK: [0] Register: ex6 renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex8, killed renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: [1] Register: ex6 renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, renamable $ex2, renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Live Range #2 for ex8: +# CHECK: Definitions (1): +# CHECK: [0] Register: ex8 renamable $ex8 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd +# CHECK: Uses (2): +# CHECK: [0] Register: ex8 renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, killed renamable $ex8, killed renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: [1] Register: ex8 renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex8, killed renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Live Range #29 for x2: +# CHECK: Definitions (1): +# CHECK: [0] Register: x2 $x2, $p6, $dc0, $dc4 = VLDB_3D_dmx_ldb_x killed $p6(tied-def 1), $d0_3d :: (load (<16 x s32>)) +# CHECK: Uses (2): +# CHECK: [0] Register: x2 renamable $x11 = VSHUFFLE_vec_shuffle_x killed renamable $x2, killed renamable $x4, renamable $r1 +# CHECK: [1] Register: x2 renamable $x10 = VSHUFFLE_vec_shuffle_x renamable $x2, renamable $x4, renamable $r0 +# CHECK: Live Range #30 for x4: +# CHECK: Definitions (1): +# CHECK: [0] Register: x4 renamable $x4 = VLDB_dmx_ldb_x_idx_imm renamable $p5, 64 :: (load (<16 x s32>)) +# CHECK: Uses (2): +# CHECK: [0] Register: x4 renamable $x11 = VSHUFFLE_vec_shuffle_x killed renamable $x2, killed renamable $x4, renamable $r1 +# CHECK: [1] Register: x4 renamable $x10 = VSHUFFLE_vec_shuffle_x renamable $x2, renamable $x4, renamable $r0 +# CHECK: Live Range 
#23 for x6: +# CHECK: Definitions (1): +# CHECK: [0] Register: x6 renamable $x6, renamable $p5 = VLDB_dmx_ldb_x_pstm_nrm_imm killed renamable $p5(tied-def 1), 64 :: (load (<16 x s32>)) +# CHECK: Uses (2): +# CHECK: [0] Register: x6 renamable $x5 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x8, renamable $r1 +# CHECK: [1] Register: x6 renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x8, renamable $r0 +# CHECK: Live Range #24 for x8: +# CHECK: Definitions (1): +# CHECK: [0] Register: x8 renamable $x8 = VLDB_dmx_ldb_x_idx_imm killed renamable $p5, 0 :: (load (<16 x s32>)) +# CHECK: Uses (2): +# CHECK: [0] Register: x8 renamable $x5 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x8, renamable $r1 +# CHECK: [1] Register: x8 renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x8, renamable $r0 +# CHECK: Live Range #19 for y2: +# CHECK: Definitions (2): +# CHECK: [0] Register: x5 (SubRegIdx: 5) renamable $x5 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x8, renamable $r1 +# CHECK: [1] Register: x4 (SubRegIdx: 8) renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x8, renamable $r0 +# CHECK: Uses (1): +# CHECK: [0] Register: y2 renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y2, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask +# CHECK: Live Range #27 for y5: +# CHECK: Definitions (2): +# CHECK: [0] Register: x11 (SubRegIdx: 5) renamable $x11 = VSHUFFLE_vec_shuffle_x killed renamable $x2, killed renamable $x4, renamable $r1 +# CHECK: [1] Register: x10 (SubRegIdx: 8) renamable $x10 = VSHUFFLE_vec_shuffle_x renamable $x2, renamable $x4, renamable $r0 +# CHECK: Uses (1): +# CHECK: [0] Register: y5 renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y5, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + +# CHECK: Available Physical Registers for Reallocation: +# CHECK: 
============================================== +# CHECK: bmhh4 +# CHECK: bmhl4 +# CHECK: bmlh4 +# CHECK: bmll4 +# CHECK: cmh4 +# CHECK: cml4 +# CHECK: dm4 +# CHECK: e2 +# CHECK: e4 +# CHECK: e6 +# CHECK: e8 +# CHECK: eh2 +# CHECK: eh4 +# CHECK: eh6 +# CHECK: eh8 +# CHECK: el2 +# CHECK: el4 +# CHECK: el6 +# CHECK: el8 +# CHECK: ewh2 +# CHECK: ewh4 +# CHECK: ewh6 +# CHECK: ewh8 +# CHECK: ewl2 +# CHECK: ewl4 +# CHECK: ewl6 +# CHECK: ewl8 +# CHECK: ex2 +# CHECK: ex4 +# CHECK: ex6 +# CHECK: ex8 +# CHECK: wh2 +# CHECK: wh4 +# CHECK: wh5 +# CHECK: wh6 +# CHECK: wh8 +# CHECK: wh10 +# CHECK: wh11 +# CHECK: wl2 +# CHECK: wl4 +# CHECK: wl5 +# CHECK: wl6 +# CHECK: wl8 +# CHECK: wl10 +# CHECK: wl11 +# CHECK: x2 +# CHECK: x4 +# CHECK: x5 +# CHECK: x6 +# CHECK: x8 +# CHECK: x10 +# CHECK: x11 +# CHECK: y2 +# CHECK: y5 +# CHECK: Total: 54 registers +# CHECK: === END FINAL LIVE RANGES + +# derived from GEMM_Bfp16_opt_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !0 + + for.cond.cleanup: ; preds = %for.body, 
%entry + ret void + + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1, !2} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.2 + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2.for.body (align 16): + successors: %bb.2, %bb.3 + liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y0:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 + + $p5 = MOV_alu_mv_mv_mv_scl $p6 + $x2, $p6, $dc0, $dc4 = VLD_3D_x_pseudo killed $p6, $d0_3d :: (load (<16 x s32>)) + renamable $x4 = VLD_x_idx_imm_pseudo renamable $p5, 64 :: (load (<16 x s32>)) + $p3 = MOV_alu_mv_mv_mv_scl $p7 + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf killed $p7, $d1_3d :: (load (<32 x s16>)) + renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm renamable $p3, 64 :: (load (<32 x s16>)) + renamable $p5 = PADD_mod_pseudo killed renamable $p5, renamable $m4 + renamable $x6, renamable $p5 = VLD_x_pstm_nrm_imm_pseudo killed renamable $p5, 64 :: (load (<16 x s32>)) + renamable $x8 = VLD_x_idx_imm_pseudo killed renamable $p5, 0 :: (load (<16 x s32>)) + renamable $x10 = VSHUFFLE_vec_shuffle_x renamable $x2, renamable $x4, renamable $r0 + renamable $x11 = VSHUFFLE_vec_shuffle_x killed renamable $x2, killed renamable $x4, renamable $r1 + renamable $ex2 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y5, renamable $y0, 
renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x8, renamable $r0 + renamable $x5 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x8, renamable $r1 + renamable $ex6 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y2, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + renamable $p3 = PADD_mod_pseudo killed renamable $p3, renamable $m5 + renamable $ex4 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $cml4, renamable $p3 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_pstm_nrm_imm killed renamable $p3, 64 :: (load (<32 x s16>)) + renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm killed renamable $p3, 0 :: (load (<32 x s16>)) + renamable $ex8 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, renamable $ex2, renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex2, renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex8, killed renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask, + renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, killed renamable $ex8, killed renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier + +... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# REWRITE: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-ii7.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-ii7.mir new file mode 100644 index 000000000000..f0f13d3550a3 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-ii7.mir @@ -0,0 +1,159 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# This test exercises experimental modules +# AIERegDefUseTracker +# AIEScheduleInterpreter +# AIEPostRegAlloc +# using the motivating GEMM example skipping WAWRegRewriter. +# We rewrite suitable physregs to virtual regs, create the dependence graph, +# pipeline, then reallocate the virtual regs + + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --aie-postpipeliner-filter-no-choice=0 \ +# RUN: --aie-wawreg-rewrite=0 \ +# RUN: --aie-preassign-multi-slot-instr=1 \ +# RUN: --aie-materialize-pipeline=0 \ +# RUN: --aie-postpipeliner-maxii=10 \ +# RUN: -o - %s | FileCheck %s + +# derived from GEMM_Bfp16_opt_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: add.nc lc, r0, #0 + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_1: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: 
Depth=1 + ; CHECK-NEXT: nopa ; vldb.3d x2, [p6], d0; nopx ; mov p5, p6 + ; CHECK-NEXT: padda [p5], m4; vldb x4, [p5, #64] + ; CHECK-NEXT: vldb x6, [p5], #64 + ; CHECK-NEXT: vldb x8, [p5, #0] + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: mov p3, p7 + ; CHECK-NEXT: vshuffle x10, x2, x4, r0 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p7], d1; vshuffle x11, x2, x4, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #64]; vshuffle x4, x6, x8, r0 + ; CHECK-NEXT: vshuffle x5, x6, x8, r1 + ; CHECK-NEXT: padda [p3], m5; vmul.f dm4, y5, y0, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p3], #64; vmul.f dm4, y2, y0, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p3, #0] + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex6, dm4 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex4, dm4 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex8, dm4 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vmac.f dm3, dm3, ex2, ex6, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex2, ex4, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex8, ex6, r3 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex4, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: // %bb.2: + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 + %add = add nsw i32 %0, 
1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !0 + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1, !2} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.2 + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2.for.body (align 16): + successors: %bb.2, %bb.3 + liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y0:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 + + $p5 = MOV_alu_mv_mv_mv_scl $p6 + $x2, $p6, $dc0, $dc4 = VLD_3D_x_pseudo killed $p6, $d0_3d :: (load (<16 x s32>)) + renamable $x4 = VLD_x_idx_imm_pseudo renamable $p5, 64 :: (load (<16 x s32>)) + $p3 = MOV_alu_mv_mv_mv_scl $p7 + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf killed $p7, $d1_3d :: (load (<32 x s16>)) + renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm renamable $p3, 64 :: (load (<32 x s16>)) + renamable $p5 = PADD_mod_pseudo killed renamable $p5, renamable $m4 + renamable $x6, renamable $p5 = VLD_x_pstm_nrm_imm_pseudo killed renamable $p5, 64 :: (load (<16 x s32>)) + renamable $x8 = VLD_x_idx_imm_pseudo killed renamable $p5, 0 :: (load (<16 x s32>)) + renamable $x10 = 
VSHUFFLE_vec_shuffle_x renamable $x2, renamable $x4, renamable $r0 + renamable $x11 = VSHUFFLE_vec_shuffle_x killed renamable $x2, killed renamable $x4, renamable $r1 + renamable $ex2 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y5, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x8, renamable $r0 + renamable $x5 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x8, renamable $r1 + renamable $ex6 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y2, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + renamable $p3 = PADD_mod_pseudo killed renamable $p3, renamable $m5 + renamable $ex4 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $cml4, renamable $p3 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_pstm_nrm_imm killed renamable $p3, 64 :: (load (<32 x s16>)) + renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm killed renamable $p3, 0 :: (load (<32 x s16>)) + renamable $ex8 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, renamable $ex2, renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex2, renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex8, killed renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit 
$crfpmask, + renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, killed renamable $ex8, killed renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-mli.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-mli.mir new file mode 100644 index 000000000000..6e544a614433 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/gemm-bfp16-mli.mir @@ -0,0 +1,124 @@ + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# This test exercises experimental modules AIERegDefUseTracker and AIEScheduleInterpreter +# using the motivating GEMM example with multi-slot pseudo materialization + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched --stop-after=postmisched \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --aie-postpipeliner-filter-no-choice=false \ +# RUN: --aie-postpipeliner-maxii=7 \ +# RUN: -o - --debug-only=aie-postregalloc %s 2>&1 | FileCheck %s +# REQUIRES: asserts + +# derived from GEMM_Bfp16_opt_0 + +# CHECK: Live Lanes (II=7): +# CHECK: VReg | t0 t1 t2 t3 t4 t5 t6 +# CHECK: -------+-------------------------------------------------------- +# CHECK: 0 | .. # # #R1 R1 .. .. +# CHECK: 1 | # # # #R1 R1 .. .. +# CHECK: 2 | # .. # # # # # +# CHECK: 3 | .. .. .. W1 #W1 # .. +# CHECK: 4 | .. .. .. # # #R1 R1 +# CHECK: 5 | .. .. # # # #R1 R1 +# CHECK: 6 | .. .. .. .. # .. .. +# CHECK: 7 | # .. .. .. .. W1 #W1 +# CHECK: 8 | .. .. .. .. .. .. # +# CHECK: 9 | # # .. .. .. 
# # +# CHECK: 10 | # # # # # # # +# CHECK: 11 | .. # # .. .. .. .. +# CHECK: 12 | .. .. .. # # .. .. +# CHECK: 13 | # # # # # # # + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !0 + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1, !2} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.2 + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2.for.body (align 16): + successors: %bb.2, %bb.3 + liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y0:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 + + $p5 = MOV_alu_mv_mv_mv_scl $p6 + $x2, $p6, $dc0, $dc4 = VLD_3D_x_pseudo killed $p6, $d0_3d :: (load (<16 x s32>)) + renamable $x4 = VLD_x_idx_imm_pseudo renamable $p5, 64 :: (load (<16 x s32>)) + $p3 = MOV_alu_mv_mv_mv_scl $p7 + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf killed $p7, $d1_3d :: (load (<32 x s16>)) + renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm renamable $p3, 64 :: (load (<32 x s16>)) + renamable $p5 = PADD_mod_pseudo killed renamable $p5, renamable $m4 + renamable $x6, renamable $p5 = VLD_x_pstm_nrm_imm_pseudo killed renamable $p5, 64 :: (load (<16 x s32>)) + renamable $x8 = VLD_x_idx_imm_pseudo killed renamable $p5, 0 :: (load (<16 x s32>)) + renamable $x10 = VSHUFFLE_vec_shuffle_x renamable $x2, renamable $x4, renamable $r0 + renamable $x11 = VSHUFFLE_vec_shuffle_x killed renamable $x2, killed renamable $x4, renamable $r1 + renamable $ex2 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y5, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x8, renamable $r0 + renamable $x5 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x8, 
renamable $r1 + renamable $ex6 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y killed renamable $y2, renamable $y0, renamable $r2, implicit-def dead $srfpflags, implicit $crfpmask + renamable $p3 = PADD_mod_pseudo killed renamable $p3, renamable $m5 + renamable $ex4 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $cml4, renamable $p3 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_pstm_nrm_imm killed renamable $p3, 64 :: (load (<32 x s16>)) + renamable $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm killed renamable $p3, 0 :: (load (<32 x s16>)) + renamable $ex8 = VCONV_bfp16ebs8_fp32 killed renamable $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, renamable $ex2, renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex2, renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex8, killed renamable $ex6, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask, + renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, killed renamable $ex8, killed renamable $ex4, renamable $r3, implicit-def dead $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-available-phys-regs.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-available-phys-regs.mir new file mode 100644 index 000000000000..52e47960d28c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-available-phys-regs.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +# NOTE: Test for AIERegDefUseTracker - available physical registers tracking +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p \ +# RUN: --start-before=postmisched --stop-after=postmisched \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o - %s 2>&1 | FileCheck %s +# REQUIRES: asserts + +# This test verifies that the AIERegDefUseTracker correctly tracks and dumps +# available physical registers after rewriting them to virtual registers. + +# CHECK: Available Physical Registers for Reallocation: +# CHECK: ============================================== +# CHECK-DAG: bmhh4 +# CHECK-DAG: bmhl4 +# CHECK-DAG: bmlh4 +# CHECK-DAG: bmll4 +# CHECK-DAG: cmh4 +# CHECK-DAG: cml4 +# CHECK-DAG: dm4 +# CHECK-DAG: l0 +# CHECK-DAG: r0 +# CHECK-DAG: r1 +# CHECK: Total: 10 registers + +--- | + define void @test_available_regs_dump() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_available_regs_dump +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p1, $p6, $y5, $y0, $r2, $r3 + + $lc = ADD_NC_mv_add_ri $r3, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $p6, $y5, $y0, $r2 + + ; Simple scalar registers that should become available + $r0 = MOV_alu_mv_mv_mv_scl $p6 + $r1 = ADD_NC_mv_add_rr $r0, $r2 + + ; Composite register dm4 with subregs cml4 and cmh4 that should all become available + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-physreg-to-vreg-rewrite-final.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-physreg-to-vreg-rewrite-final.mir new file mode 100644 index 000000000000..ae5ca5a6d4ea --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-physreg-to-vreg-rewrite-final.mir @@ -0,0 +1,264 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -mtriple=aie2p -run-pass=postmisched --aie-postpipeliner-vreg-mode %s -o - | FileCheck %s + +# This test verifies that the --aie-postpipeliner-vreg-mode option correctly +# replaces filtered physical registers with virtual registers of the appropriate class. + +--- +name: simple_scalar_def_use +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: simple_scalar_def_use + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $lc = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $p6, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r0 = MOV_alu_mv_mv_mv_scl $p6 + ; CHECK-NEXT: $r1 = ADD_NC_mv_add_rr killed $r0, $r2 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry: + liveins: $p0 + successors: %bb.1 + $lc = MOV_alu_mv_mv_mv_scl $p0 + + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p6, $r2 + + $r0 = MOV_alu_mv_mv_mv_scl $p6 + $r1 = ADD_NC_mv_add_rr $r0, $r2 + PseudoLoopEnd , %bb.1 + + bb.2: + RET implicit $lr + DelayedSchedBarrier + +... 
+--- +name: composite_dm_register +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: composite_dm_register + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $lc = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $p1, $y5, $y0, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm killed $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + ; CHECK-NEXT: VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm killed $cmh4, $p1, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry: + liveins: $p0 + successors: %bb.1 + $lc = MOV_alu_mv_mv_mv_scl $p0 + + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $y5, $y0, $r2 + + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2: + RET implicit $lr + DelayedSchedBarrier + +... 
+--- +name: partial_composite_def +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: partial_composite_def + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $cmh4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $lc = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $p7, $cmh4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 0 :: (load (<32 x s16>)) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry: + liveins: $p0, $cmh4 + successors: %bb.1 + $lc = MOV_alu_mv_mv_mv_scl $p0 + + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p7, $cmh4 + + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 0 :: (load (<32 x s16>)) + + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2: + RET implicit $lr + DelayedSchedBarrier + +... 
+--- +name: tied_operands +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: tied_operands + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p3, $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $lc = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $p3, $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $r1, $p3, $dc0 = LDA_2D_dms_lda killed $p3, $d0 :: (load (s32)) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry: + liveins: $p0, $p3, $d0 + successors: %bb.1 + $lc = MOV_alu_mv_mv_mv_scl $p0 + + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p3, $d0 + + dead $r1, $p3, $dc0 = LDA_2D_dms_lda $p3, $d0 :: (load (s32)) + PseudoLoopEnd , %bb.1 + + bb.2: + RET implicit $lr + DelayedSchedBarrier + +... 
+--- +name: composite_with_liveout +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: composite_with_liveout + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $lc = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $y5, $y0, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm killed $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $p0, $cmh4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm killed $cmh4, killed $p0, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry: + liveins: $p0 + successors: %bb.1 + $lc = MOV_alu_mv_mv_mv_scl $p0 + + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $y5, $y0, $r2 + + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + PseudoLoopEnd , %bb.1 + + bb.2: + liveins: $p0, $cmh4 + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + RET implicit $lr + DelayedSchedBarrier + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-vreg-metrics.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-vreg-metrics.mir new file mode 100644 index 000000000000..27d46dcf1b63 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test-vreg-metrics.mir @@ -0,0 +1,143 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# Test virtual register metrics dumping in AIEPostRegAlloc +# This test verifies that the metrics are properly computed and displayed +# REQUIRES: asserts + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --aie-postpipeliner-filter-no-choice=false \ +# RUN: --aie-wawreg-rewrite=0 \ +# RUN: --aie-preassign-multi-slot-instr=1 \ +# RUN: --aie-materialize-pipeline=0 \ +# RUN: --aie-postpipeliner-maxii=7 \ +# RUN: --debug-only=aie-postregalloc \ +# RUN: -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=METRICS + +# Based on the actual output from gemm-bfp16-ii7.mir, we expect: +# - 14 virtual registers total +# - Various register classes (ex, x, y, dm) +# - Interference degrees ranging from 0 to 9 +# - Different lane widths and durations + +# NOTE: Status quo - II=7 (was II=6) +# METRICS: AIEPostRegAlloc::allocate for 14 vregs, II=7 +# METRICS: === Virtual Register Metrics Dump === +# METRICS: Total Virtual Registers: 14 +# METRICS-EMPTY: +# METRICS: VReg RegClass Avail Pure Alias TotalLanes MaxWidth Duration +# METRICS-NEXT: -------- ----------------------- ----- ---- ----- ---------- -------- -------- +# METRICS-NEXT: %vreg0 VEC512 7 3 600 6 2 4 +# METRICS-NEXT: %vreg1 VEC512 7 3 800 8 2 5 +# METRICS-NEXT: %vreg2 
eDM 1 3 0 22 4 6 FAIL +# METRICS-NEXT: %vreg3 eY 2 1 700 6 4 3 +# METRICS-NEXT: %vreg4 VEC512 7 3 700 6 2 4 +# METRICS-NEXT: %vreg5 VEC512 7 3 800 8 2 5 +# METRICS-NEXT: %vreg6 eDM 1 1 0 4 4 1 FAIL +# METRICS-NEXT: %vreg7 eY 2 1 500 6 4 3 +# METRICS-NEXT: %vreg8 eDM 1 2 0 4 4 1 FAIL +# METRICS-NEXT: %vreg9 eDM 1 2 0 14 4 4 FAIL +# METRICS-NEXT: %vreg10 VEC576 4 3 800 28 4 7 +# METRICS-NEXT: %vreg11 VEC576 4 2 300 8 4 2 +# METRICS-NEXT: %vreg12 VEC576 4 2 600 8 4 2 +# METRICS-NEXT: %vreg13 VEC576 4 3 800 28 4 7 +# METRICS-EMPTY: +# METRICS: === Summary Statistics === +# METRICS: Total Lanes (sum): 156 +# METRICS: Max Width (max): 4 +# METRICS: Max Duration: 7 +# METRICS: Max Pure Interference Degree: 3 +# METRICS: Max Aliasing Interference Deg: 800 +# METRICS: Avg Pure Interference Degree: 2.29 +# METRICS: Avg Aliasing Interference Deg: 471.43 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.body: + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !0 + + for.cond.cleanup: + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1, !2} 
+ !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.2 + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2.for.body (align 16): + successors: %bb.2, %bb.3 + liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y0:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 + + $p5 = MOV_alu_mv_mv_mv_scl $p6 + $x2, $p6, $dc0, $dc4 = VLD_3D_x_pseudo $p6, $d0_3d :: (load (<16 x s32>)) + $x4 = VLD_x_idx_imm_pseudo $p5, 64 :: (load (<16 x s32>)) + $p3 = MOV_alu_mv_mv_mv_scl $p7 + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf $p7, $d1_3d :: (load (<32 x s16>)) + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p3, 64 :: (load (<32 x s16>)) + $p5 = PADD_mod_pseudo $p5, $m4 + $x6, $p5 = VLD_x_pstm_nrm_imm_pseudo $p5, 64 :: (load (<16 x s32>)) + $x8 = VLD_x_idx_imm_pseudo $p5, 0 :: (load (<16 x s32>)) + $x10 = VSHUFFLE_vec_shuffle_x $x2, $x4, $r0 + $x11 = VSHUFFLE_vec_shuffle_x $x2, $x4, $r1 + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + $x4 = VSHUFFLE_vec_shuffle_x $x6, $x8, $r0 + $x5 = VSHUFFLE_vec_shuffle_x $x6, $x8, $r1 + $ex6 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y2, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + $p3 = PADD_mod_pseudo $p3, $m5 + $ex4 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, 
implicit $crrnd + $cml4, $p3 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_pstm_nrm_imm $p3, 64 :: (load (<32 x s16>)) + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p3, 0 :: (load (<32 x s16>)) + $ex8 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm3, $ex2, $ex6, $r3, implicit-def dead $srfpflags, implicit $crfpmask + $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm2, $ex2, $ex4, $r3, implicit-def dead $srfpflags, implicit $crfpmask + $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm1, $ex8, $ex6, $r3, implicit-def dead $srfpflags, implicit $crfpmask + $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm0, $ex8, $ex4, $r3, implicit-def dead $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test1-simple-def-use.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test1-simple-def-use.mir new file mode 100644 index 000000000000..24d930be777c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test1-simple-def-use.mir @@ -0,0 +1,62 @@ +# NOTE: Test for AIERegDefUseTracker - simple def-use chains +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 1: def-use def-use on the same simple register leads to two live ranges +# CHECK-DAG: Live Range {{.*}} for r0: +# CHECK-DAG: Live Range {{.*}} for r0: + +--- | + define void @test_simple_def_use() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +... +--- +name: test_simple_def_use +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p6, $r0, $r2, $r4 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p6, $r2, $r4 + + ; First def-use chain + $r0 = MOV_alu_mv_mv_mv_scl $p6 + $r1 = ADD_NC_mv_add_rr $r0, $r2 + + ; Second def-use chain on same register + $r0 = MOV_alu_mv_mv_mv_scl $p6 + $r3 = ADD_NC_mv_add_rr $r0, $r4 + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test10-reserved-subreg-loads-vmul-liveout.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test10-reserved-subreg-loads-vmul-liveout.mir new file mode 100644 index 000000000000..f61705e81282 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test10-reserved-subreg-loads-vmul-liveout.mir @@ -0,0 +1,117 @@ +# NOTE: Test for AIERegDefUseTracker - reserved ranges with subreg loads and VMUL composite use +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 10: Verify reserved range handling with subreg loads feeding VMUL with live-out +# +# This test exercises reserved ranges with subreg defs and composite Y register use: +# 1. Subreg loads (x10, x11) that feed into a composite Y register use (y5) in VMUL +# where the result feeds into a live-out should create a RESERVED range +# 2. An additional disjoint live range on the same composite register should +# NOT make that register available for reallocation +# +# Program order (backward analysis processes in reverse): +# - y5: def x10, def x11 → use y5 in VMUL (disjoint, early) - analyzed LAST → normal +# - y5: def x10, def x11 → use y5 in VMUL (late, result feeds live-out) - analyzed FIRST → RESERVED +# +# Expected behavior: +# - y5's late range (feeding live-out) should be marked RESERVED +# - y5's early disjoint range should be normal (not reserved) +# - y5 (and its subregs) should NOT appear in available physical registers + +# CHECK-DAG: Live Range #{{[0-9]+}} for dm4: +# CHECK-DAG: Live Range #{{[0-9]+}} for y5 [RESERVED]: +# CHECK-DAG: Live Range #{{[0-9]+}} for y5: + +# Verify available registers explicitly - dm4 and subregs should be available, but not y5/x10/x11 +# CHECK: FINAL LIVE RANGES +# CHECK: Available Physical Registers for Reallocation: +# CHECK-NEXT: ============================================== +# CHECK-DAG: bmhh4 +# CHECK-DAG: bmhl4 +# CHECK-DAG: bmlh4 +# CHECK-DAG: bmll4 +# CHECK-DAG: cmh4 +# CHECK-DAG: cml4 +# CHECK-DAG: dm4 +# CHECK: Total: 7 
registers + +# Verify scarce range set - both dm4 ranges should be identified as scarce +# CHECK: Most promising scarce range set: 2 ranges +# CHECK-NEXT: Register class: eDM +# CHECK-NEXT: [0] BaseReg=dm4 Defs=1 Uses=2 +# CHECK-NEXT: [1] BaseReg=dm4 Defs=1 Uses=2 + +--- | + define void @test_reserved_subreg_loads_vmul_liveout() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label %exit + exit: + ret void + } +... +--- +name: test_reserved_subreg_loads_vmul_liveout +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p1, $p7, $y0, $r2, $r3 + + $lc = ADD_NC_mv_add_ri $r3, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $p7, $y0, $r2 + + ; EARLY in program order (analyzed LAST in backward pass): + ; Subreg loads (x10, x11) → composite use (y5) in VMUL, disjoint from later range + ; This is a normal range + $x10, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 0 :: (load (<16 x s32>)) + $x11, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 64 :: (load (<16 x s32>)) + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; LATE in program order (analyzed FIRST in backward pass): + ; Subreg loads (x10, x11) → composite use (y5) in VMUL where result feeds live-out + ; This should be marked as RESERVED because y5 (not the dm4 result) is live-out to bb.2 + $x10, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 128 :: (load (<16 x s32>)) + $x11, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 192 :: (load (<16 x s32>)) + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 128, implicit-def $srf2fflags, 
implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 192, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + liveins: $y5, $y0, $r2, $p0 + + ; Use y5 in another VMUL - this makes y5 live-out from bb.1 + ; Backward analysis starts here, sees y5 is live-in to bb.2 + ; Then traces back and finds the LATE x10/x11 → y5 chain (closest to block end) + ; feeds into this, so that y5 range becomes RESERVED + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 256, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test11-reserved-subreg-loads-vshuffle-vmul-liveout.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test11-reserved-subreg-loads-vshuffle-vmul-liveout.mir new file mode 100644 index 000000000000..7b670f5f5f97 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test11-reserved-subreg-loads-vshuffle-vmul-liveout.mir @@ -0,0 +1,130 @@ +# NOTE: Test for AIERegDefUseTracker - reserved ranges with subreg loads, VSHUFFLE, and VMUL composite use +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 11: Verify reserved range handling with subreg loads, VSHUFFLE, and VMUL with live-out +# +# This test exercises reserved ranges with subreg defs via VSHUFFLE and composite Y register use: +# 1. Subreg loads (x8, x9) that are shuffled to create (x10, x11) which feed into a composite +# Y register use (y5) in VMUL where the result feeds into a live-out should create a RESERVED range +# 2. An additional disjoint live range on the same composite register should +# NOT make that register available for reallocation +# 3. Since x8/x9 (forming y4) are only used as inputs to VSHUFFLE, y4 should become available +# +# Program order (backward analysis processes in reverse): +# - y5: def x8, def x9 → VSHUFFLE → def x10, def x11 → use y5 in VMUL (disjoint, early) - analyzed LAST → normal +# - y5: def x10, def x11 → use y5 in VMUL (late, result feeds live-out) - analyzed FIRST → RESERVED +# +# Expected behavior: +# - y5's late range (feeding live-out) should be marked RESERVED +# - y5's early disjoint range should be normal (not reserved) +# - y5 (and its subregs x10/x11) should NOT appear in available physical registers +# - y4 (and its subregs x8/x9) SHOULD appear in available physical registers + +# CHECK-DAG: Live Range #{{[0-9]+}} for dm4: +# CHECK-DAG: Live Range #{{[0-9]+}} for x8: +# CHECK-DAG: Live Range #{{[0-9]+}} for x9: +# CHECK-DAG: Live Range #{{[0-9]+}} for y5 [RESERVED]: +# CHECK-DAG: Live Range #{{[0-9]+}} for y5: + +# Verify available registers explicitly - dm4, y4, and their subregs should be available, but not y5/x10/x11 +# CHECK: FINAL LIVE RANGES +# CHECK: Available 
Physical Registers for Reallocation: +# CHECK-DAG: bmhh4 +# CHECK-DAG: bmhl4 +# CHECK-DAG: bmlh4 +# CHECK-DAG: bmll4 +# CHECK-DAG: cmh4 +# CHECK-DAG: cml4 +# CHECK-DAG: dm4 +# CHECK-DAG: wh8 +# CHECK-DAG: wh9 +# CHECK-DAG: wl8 +# CHECK-DAG: wl9 +# CHECK-DAG: x8 +# CHECK-DAG: x9 +# CHECK-DAG: y4 +# CHECK: Total: 14 registers + +# Verify scarce range set - both dm4 ranges should be identified as scarce +# CHECK: Most promising scarce range set: 2 ranges +# CHECK-NEXT: Register class: eDM +# CHECK-NEXT: [0] BaseReg=dm4 Defs=1 Uses=2 +# CHECK-NEXT: [1] BaseReg=dm4 Defs=1 Uses=2 + +--- | + define void @test_reserved_subreg_loads_vshuffle_vmul_liveout() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label %exit + exit: + ret void + } +... +--- +name: test_reserved_subreg_loads_vshuffle_vmul_liveout +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p1, $p7, $y0, $r2, $r3 + + $lc = ADD_NC_mv_add_ri $r3, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $p7, $y0, $r2, $r3 + + ; EARLY in program order (analyzed LAST in backward pass): + ; Subreg loads (x8, x9) → VSHUFFLE → (x10, x11) → composite use (y5) in VMUL, disjoint from later range + ; This is a normal range + ; Since x8/x9 are only used as VSHUFFLE inputs, y4 should become available + $x8, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 0 :: (load (<16 x s32>)) + $x9, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 64 :: (load (<16 x s32>)) + $x10 = VSHUFFLE_vec_shuffle_x $x8, $x9, $r2 + $x11 = VSHUFFLE_vec_shuffle_x $x8, $x9, $r3 + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; LATE in program order (analyzed 
FIRST in backward pass): + ; Subreg loads (x10, x11) → composite use (y5) in VMUL where result feeds live-out + ; This should be marked as RESERVED because y5 (not the dm4 result) is live-out to bb.2 + $x10, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 128 :: (load (<16 x s32>)) + $x11, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 192 :: (load (<16 x s32>)) + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 128, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 192, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + liveins: $y5, $y0, $r2, $p0 + + ; Use y5 in another VMUL - this makes y5 live-out from bb.1 + ; Backward analysis starts here, sees y5 is live-in to bb.2 + ; Then traces back and finds the LATE x10/x11 → y5 chain (closest to block end) + ; feeds into this, so that y5 range becomes RESERVED + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 256, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test12-reserved-subreg-scarce-superreg.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test12-reserved-subreg-scarce-superreg.mir new file mode 100644 index 000000000000..8ac5aa3d25b4 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test12-reserved-subreg-scarce-superreg.mir @@ -0,0 +1,130 @@ +# NOTE: Test for AIERegDefUseTracker - reserved subreg with scarce superreg from separate subreg liveranges +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 12: Verify scarce register creation from separate subreg liveranges +# +# This test exercises a complex scenario: +# 1. We have a RESERVED live range on x0 (subreg of y0 = [x0, x1]) +# 2. We have two normal live ranges on y0 (superregister containing x0) +# 3. We have two separate normal liveranges: one on x2, one on x3 (subregs of y1 = [x2, x3]) +# 4. The result should be y1 identified as a scarce register with two live ranges (from x2 and x3) +# +# Register structure: +# - y0 = [x0, x1] where x0 = [wl0, wh0], x1 = [wl1, wh1] +# - y1 = [x2, x3] where x2 = [wl2, wh2], x3 = [wl3, wh3] +# +# Program order (backward analysis processes in reverse): +# - y0: def x0, def x1 → use y0 in VMUL (early, normal, disjoint) +# - x2: def → use (normal, makes x2 available) +# - x3: def → use (normal, makes x3 available) +# - y0: def x0, def x1 → use y0 in VMUL (middle, normal, disjoint) +# - x0: def → use (LATE, RESERVED, feeds live-out, NOT killed) +# +# Expected behavior: +# - x0's late range (feeding live-out) should be marked RESERVED +# - y0's ranges should be normal (not reserved) +# - x0 should NOT appear in available physical registers (due to reserved range) +# - x2 and x3 SHOULD appear in available registers +# - y1 (composed of x2, x3) should be identified as a scarce register with 2 ranges + +# Verify final analysis results +# CHECK: FINAL LIVE RANGES +# CHECK: Total live ranges: 7 +# CHECK-DAG: Live Range #{{[0-9]+}} for x0 [RESERVED]: +# CHECK-DAG: Live Range 
#{{[0-9]+}} for x2: +# CHECK-DAG: Live Range #{{[0-9]+}} for x3: +# CHECK-DAG: Live Range #{{[0-9]+}} for y0: + +# Verify available registers - x2, x3, y1 should be available, but not x0/y0 +# CHECK: Available Physical Registers for Reallocation: +# CHECK-NEXT: ============================================== +# CHECK-DAG: wh2 +# CHECK-DAG: wh3 +# CHECK-DAG: wl2 +# CHECK-DAG: wl3 +# CHECK-DAG: x2 +# CHECK-DAG: x3 +# CHECK-DAG: y1 +# CHECK-NOT: x0 +# CHECK-NOT: wl0 +# CHECK-NOT: wh0 +# CHECK-NOT: x1 +# CHECK-NOT: y0 + +# Verify scarce range set - y0 should be identified with 2 ranges +# CHECK: Most promising scarce range set: 2 ranges +# CHECK-NEXT: Register class: eY +# CHECK-NEXT: [0] BaseReg=y0 Defs=2 Uses=1 +# CHECK-NEXT: [1] BaseReg=y0 Defs=2 Uses=1 + +--- | + define void @test_reserved_subreg_scarce_superreg() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label %exit + exit: + ret void + } +... +--- +name: test_reserved_subreg_scarce_superreg +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p7, $y4, $r2, $r3 + + $lc = ADD_NC_mv_add_ri $r3, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p7, $y4, $r2, $r3 + + ; EARLY in program order (analyzed LAST in backward pass): + ; First y0 range: normal, disjoint from reserved x0 + $x0, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 0 :: (load (<16 x s32>)) + $x1, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 64 :: (load (<16 x s32>)) + $dm3 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y0, $y4, $r2, implicit-def dead $srfpflags, implicit $crfpmask + + ; x2 range: normal, makes x2 available for reallocation + $x2, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 128 :: (load (<16 x s32>)) + + ; x3 range: normal, makes x3 available for reallocation + $x3, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 192 :: (load (<16 x s32>)) + + ; Second y0 range: normal, disjoint from reserved x0 range + $x0, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 256 :: (load (<16 
x s32>)) + $x1, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 320 :: (load (<16 x s32>)) + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y0, $y4, $r2, implicit-def dead $srfpflags, implicit $crfpmask + + ; LATE in program order (analyzed FIRST in backward pass): + ; RESERVED range: x0 load that feeds into live-out + $x0, $p7 = VLD_x_pstm_nrm_imm_pseudo $p7, 384 :: (load (<16 x s32>)) + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + ; We make x0 live to force a reserved range + liveins: $x0 + + + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test13-read-modify-write.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test13-read-modify-write.mir new file mode 100644 index 000000000000..00c40a65ce05 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test13-read-modify-write.mir @@ -0,0 +1,87 @@ +# NOTE: Test for AIERegDefUseTracker - read-modify-write pattern +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 13: Read-modify-write pattern +# +# This tests the backward scan handling of an instruction that both reads and writes +# the same register: +# x0 = VLDA... p0 ; def x0 (first def) +# x0 = VSHUFFLE x0, x0, r0 ; use x0, use x0, def x0 (read-modify-write) +# VST x0 ; use x0 (final use) +# +# In the backward scan with correct def-before-use processing: +# 1. 
VST makes x0 live (uses x0) - creates Live Range #0 +# 2. VSHUFFLE def KILLS x0's liveness (terminates Range #0) +# 3. VSHUFFLE uses create a NEW live range #1 for x0 +# 4. VLDA def terminates Live Range #1 +# +# The expected result is TWO separate live ranges for x0: +# - Live Range 1: VLDA def -> VSHUFFLE uses (1 def, 2 uses) +# - Live Range 2: VSHUFFLE def -> VST use (1 def, 1 use) +# +# CHECK: FINAL LIVE RANGES +# CHECK: Live Range #{{[0-9]+}} for x0: +# CHECK: Definitions (1): +# CHECK: Uses (2): +# CHECK: Live Range #{{[0-9]+}} for x0: +# CHECK: Definitions (1): +# CHECK: Uses (1): + +--- | + define void @test_read_modify_write() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label %exit + exit: + ret void + } +... +--- +name: test_read_modify_write +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $r0, $r1, $r2 + + $lc = ADD_NC_mv_add_ri $r1, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $r0 + + ; Def x0 (first def) + $x0, $p0 = VLD_x_pstm_nrm_imm_pseudo $p0, 0 :: (load (<16 x s32>)) + + ; Read-modify-write: use x0 twice, def x0 + ; This instruction reads x0 and writes x0 + $x0 = VSHUFFLE_vec_shuffle_x $x0, $x0, $r0 + + ; Final use of x0 - store consumes the result + VST_dmx_sts_x_idx_imm $x0, $p0, 64 :: (store (<16 x s32>)) + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + + RET implicit $lr + DelayedSchedBarrier + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2-subreg-defs-composite-use.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2-subreg-defs-composite-use.mir new file mode 100644 index 000000000000..997434cc8efb --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2-subreg-defs-composite-use.mir @@ -0,0 +1,66 @@ +# NOTE: Test for AIERegDefUseTracker - sub-register defs with composite use +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 2: def(cml4) def(cmh4) use(dm4) leads to one live range with two defs and one use +# CHECK: Live Range {{.*}} for dm4: +# CHECK: Definitions (2): +# CHECK-DAG: Register: cml4 (SubRegIdx: 10) +# CHECK-DAG: Register: cmh4 (SubRegIdx: 9) +# CHECK: Uses (1): +# CHECK: Register: dm4 + +--- | + define void @test_subreg_defs_composite_use() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_subreg_defs_composite_use +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p7, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p7 + + ; Define low half of dm4 + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 0 :: (load (<32 x s16>)) + + ; Define high half of dm4 + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 64 :: (load (<32 x s16>)) + + ; Use composite register dm4 + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2b-missing-subreg-def.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2b-missing-subreg-def.mir new file mode 100644 index 000000000000..8eafee971e91 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2b-missing-subreg-def.mir @@ -0,0 +1,60 @@ +# NOTE: Test for AIERegDefUseTracker - missing sub-register def with composite use +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 2b: def(cml4) use(dm4) with cmh4 as livein - should filter out dm4 live range +# Since cmh4 is not defined in the block but is live-in, the dm4 live range +# should be filtered out as it's not fully defined +# CHECK-NOT: Live Range {{.*}} for dm4: + +--- | + define void @test_missing_subreg_def() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +... +--- +name: test_missing_subreg_def +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p7, $r0, $cmh4 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p7, $cmh4 + + ; Define only low half of dm4 (cmh4 is live-in) + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 0 :: (load (<32 x s16>)) + + ; Use composite register dm4 (but cmh4 was not defined in this block) + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +...
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2c-aliasing-with-unmanaged.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2c-aliasing-with-unmanaged.mir new file mode 100644 index 000000000000..4332bc1e96c9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test2c-aliasing-with-unmanaged.mir @@ -0,0 +1,72 @@ +# NOTE: Test for AIERegDefUseTracker - aliasing with unmanaged live range +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 2c: def(cml4) use(dm4) with cmh4 as livein (unmanaged), followed by def/use of cml4 +# Both dm4 and the subsequent cml4 live ranges should be filtered out +# since they alias with the unmanaged cmh4 live range +# cmh4 should not appear as a live range since it's only live-in (not defined in block) +# With implicit operands filtered, we should only have ex2 live range +# CHECK: FINAL LIVE RANGES +# CHECK: Total live ranges: 1 +# CHECK: Live Range #{{[0-9]+}} for ex2: +# CHECK-NEXT: Definitions (1): +# CHECK: Register: ex2 +# CHECK-NEXT: Uses (0): +# CHECK-EMPTY: + +--- | + define void @test_aliasing_with_unmanaged() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_aliasing_with_unmanaged +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p7, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p7, $cmh4 + + ; Define only low half of dm4 (cmh4 is explicitly live-in, unmanaged) + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 0 :: (load (<32 x s16>)) + + ; Use composite register dm4 (but cmh4 was not defined in this block) + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + + ; Now define and use cml4 again (this should also be filtered since it aliases with unmanaged cmh4) + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 32 :: (load (<32 x s16>)) + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3-composite-def-subreg-uses.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3-composite-def-subreg-uses.mir new file mode 100644 index 000000000000..3bc2015784fc --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3-composite-def-subreg-uses.mir @@ -0,0 +1,67 @@ +# NOTE: Test for AIERegDefUseTracker - composite def with sub-register uses +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 3: def(dm4) use(cml4) use(cmh4) leads to one live range with one def and two uses +# The uses should have sub-register indices recorded +# CHECK: Live Range {{.*}} for dm4: +# CHECK: Definitions (1): +# CHECK: Register: dm4 +# CHECK: Uses (2): +# CHECK-DAG: Register: cml4 (SubRegIdx: 10) +# CHECK-DAG: Register: cmh4 (SubRegIdx: 9) + +--- | + define void @test_composite_def_subreg_uses() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +... +--- +name: test_composite_def_subreg_uses +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $y5, $y0, $r2, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $y5, $y0, $r2 + + ; Define composite register dm4 + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + + ; Use low half cml4 + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; Use high half cmh4 + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +...
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3b-subreg-use-in-successor.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3b-subreg-use-in-successor.mir new file mode 100644 index 000000000000..dda105e24f94 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3b-subreg-use-in-successor.mir @@ -0,0 +1,75 @@ +# NOTE: Test for AIERegDefUseTracker - composite def with subreg use in successor +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 3b: def(dm4) use(cml4) with cmh4 used in successor block (live-out) +# Since cmh4 (a subreg of dm4) is live-out to the successor block, the dm4 live range +# is marked as RESERVED. +# CHECK: FINAL LIVE RANGES +# CHECK: Total live ranges: 1 +# CHECK: Live Range #{{[0-9]+}} for dm4 [RESERVED]: +# CHECK-NEXT: Definitions (1): +# CHECK-NEXT: [0] Register: dm4 +# CHECK: Uses (1): +# CHECK-NEXT: [0] Register: cml4 (SubRegIdx: 10) +# CHECK: Available Physical Registers for Reallocation: +# CHECK-NEXT: ============================================== +# CHECK-NEXT: Total: 0 registers + +--- | + define void @test_subreg_use_in_successor() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_subreg_use_in_successor +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $y5, $y0, $r2, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $y5, $y0, $r2 + + ; Define composite register dm4 + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + + ; Use low half cml4 in this block + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; cmh4 is live-out to successor block + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + liveins: $p0, $cmh4 + + ; Use high half cmh4 in successor block + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3c-aliasing-with-liveout.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3c-aliasing-with-liveout.mir new file mode 100644 index 000000000000..ff9553b93079 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test3c-aliasing-with-liveout.mir @@ -0,0 +1,76 @@ +# NOTE: Test for AIERegDefUseTracker - aliasing with live-out range +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 3c: def(dm4) use(cml4) with cmh4 live-out to successor, followed by def/use of cmh4 +# The cmh4 def/use creates a RESERVED range (feeds live-out). +# The dm4->cml4 range is independent (cmh4 is fully redefined after) so it's a normal range. +# NOTE: Current status quo - no registers available for reallocation due to aliasing constraints. +# CHECK: FINAL LIVE RANGES +# CHECK: Total live ranges: 2 +# CHECK-DAG: Live Range #{{[0-9]+}} for cmh4 [RESERVED]: +# CHECK-DAG: Live Range #{{[0-9]+}} for dm4: +# CHECK: Available Physical Registers for Reallocation: +# CHECK: Total: 0 registers + +--- | + define void @test_aliasing_with_liveout() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_aliasing_with_liveout +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p7, $y5, $y0, $r2, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p7, $y5, $y0, $r2 + + ; Define composite register dm4 + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + + ; Use low half cml4 in this block + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; cmh4 is live-out to successor block, but also define and use it here + ; This range is kept but marked RESERVED since it feeds the live-out cmh4 + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 64 :: (load (<32 x s16>)) + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 32, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + liveins: $p0, $cmh4 + + ; Use high half cmh4 in successor block + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test4-def-only-garbage-bin.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test4-def-only-garbage-bin.mir new file mode 100644 index 000000000000..9f6529dd3e19 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test4-def-only-garbage-bin.mir @@ -0,0 +1,60 @@ +# NOTE: Test for AIERegDefUseTracker - def-only live range (garbage bin register) +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc.
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test: Def-only live range should be kept (garbage bin register) +# r0 has def only - valid live range (garbage bin) +# CHECK: FINAL LIVE RANGES +# CHECK: Total live ranges: 1 +# CHECK: Live Range #0 for r0: +# CHECK: Definitions (1): +# CHECK: Register: r0 +# CHECK: Uses (0): + +--- | + define void @test_def_only_garbage_bin() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +... +--- +name: test_def_only_garbage_bin +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $r4 + + $lc = ADD_NC_mv_add_ri $r4, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + + ; r0: def-only (garbage bin register) - from immediate + dead $r0 = MOVA 100 + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test5-two-subreg-def-chains.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test5-two-subreg-def-chains.mir new file mode 100644 index 000000000000..45bd3285a332 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test5-two-subreg-def-chains.mir @@ -0,0 +1,72 @@ +# NOTE: Test for AIERegDefUseTracker - two separate sub-register def chains +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc.
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 5: Two separate chains with subreg defs and composite use (like test1 but with test2 pattern) +# First chain: def(cml4) def(cmh4) use(dm4) +# Second chain: def(cml4) def(cmh4) use(dm4) +# Should create two separate live ranges for dm4 +# CHECK-DAG: Live Range {{.*}} for dm4: +# CHECK-DAG: Live Range {{.*}} for dm4: + +--- | + define void @test_two_subreg_def_chains() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +... +--- +name: test_two_subreg_def_chains +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p7, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p7 + + ; First chain: subreg defs -> composite use + ; Define low half of dm4 + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 0 :: (load (<32 x s16>)) + ; Define high half of dm4 + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 64 :: (load (<32 x s16>)) + ; Use composite register dm4 + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd + + ; Second chain: subreg defs -> composite use (separate from first) + ; Define low half of dm4 again + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 128 :: (load (<32 x s16>)) + ; Define high half of dm4 again + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 192 :: (load (<32 x s16>)) + ; Use composite register dm4 again + $ex4 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask,
implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test6-two-composite-def-chains.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test6-two-composite-def-chains.mir new file mode 100644 index 000000000000..4ced5b1f3f11 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test6-two-composite-def-chains.mir @@ -0,0 +1,72 @@ +# NOTE: Test for AIERegDefUseTracker - two separate composite def chains +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 6: Two separate chains with composite def and subreg uses (like test1 but with test3 pattern) +# First chain: def(dm4) use(cml4) use(cmh4) +# Second chain: def(dm4) use(cml4) use(cmh4) +# Should create two separate live ranges for dm4 +# CHECK-DAG: Live Range {{.*}} for dm4: +# CHECK-DAG: Live Range {{.*}} for dm4: + +--- | + define void @test_two_composite_def_chains() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_two_composite_def_chains +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $y5, $y0, $r2, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $y5, $y0, $r2 + + ; First chain: composite def -> subreg uses + ; Define composite register dm4 + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + ; Use low half cml4 + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + ; Use high half cmh4 + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; Second chain: composite def -> subreg uses (separate from first) + ; Define composite register dm4 again + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + ; Use low half cml4 again + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 128, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + ; Use high half cmh4 again + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p0, 192, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test7-tied-operands.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test7-tied-operands.mir new file mode 100644 index 000000000000..6a998e344d11 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test7-tied-operands.mir @@ -0,0 +1,82 @@ +# NOTE: Test for AIERegDefUseTracker - tied register pairs +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 7: Instructions with tied operands should have their live ranges filtered out +# Tied operands cannot be independently allocated, so they should be rejected +# The LDA_2D_dms_lda instruction has tied operands where the output count +# is tied to the input mod.sub_dim_count (dc0 tied to d0.sub_dim_count) +# and p3 is tied as well (tied-def 1) +# p3's live range is filtered because it's used in a tied operand +# Only r1 should remain as it's not tied +# CHECK: FINAL LIVE RANGES +# CHECK-NEXT: ================================ +# CHECK-NEXT: Total live ranges: 1 +# CHECK-EMPTY: +# CHECK: Live Range #{{[0-9]+}} for r1: +# CHECK-NEXT: Definitions (1): +# CHECK-NEXT: [0] Register: r1 dead $r1, $p3, $dc0 = LDA_2D_dms_lda +# CHECK-NEXT: Uses (0): + +--- | + define void @test_tied_operands() { + entry: + br label %loop + loop: + br label %exit + exit: + ret void + } +...
+--- +name: test_tied_operands +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0 + + ; Define p3 (not live-in) to use in the tied instruction + $p3 = MOV_alu_mv_mv_mv_scl $p0 + + ; Define d0 through its four subregs + ; d0 consists of: m0 (sub_mod), dn0 (sub_dim_size), dj0 (sub_dim_stride), dc0 (sub_dim_count) + $m0 = MOV_scalar_imm11_pseudo 0 + $dn0 = MOV_scalar_imm11_pseudo 16 + $dj0 = MOV_scalar_imm11_pseudo 1 + $dc0 = MOV_scalar_imm11_pseudo 256 + + ; LDA_2D_dms_lda has tied operands: $count_out=$mod.sub_dim_count + ; The output $dc0 (count_out) is tied to the input $d0.sub_dim_count + ; p3 is also tied (tied-def 1) + ; This creates a tied register constraint that should be filtered + ; Both the d0 live range and the tied operands (p3, dc0) should be filtered + dead $r1, $p3, $dc0 = LDA_2D_dms_lda $p3, $d0 :: (load (s32)) + + PseudoLoopEnd , %bb.1 + + bb.2.exit (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test8-reserved-liveout-ranges.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test8-reserved-liveout-ranges.mir new file mode 100644 index 000000000000..9a5c60020659 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test8-reserved-liveout-ranges.mir @@ -0,0 +1,108 @@ +# NOTE: Test for AIERegDefUseTracker - reserved ranges for live-out defs +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 8: Verify reserved range handling for defs feeding live-out +# +# This test exercises the recent work on relaxing live-in/live-out restrictions: +# 1. A def that feeds into a live-out use should create a RESERVED range +# 2. An additional disjoint live range on the same register should NOT make +# that register available for reallocation +# +# Program order (backward analysis processes in reverse): +# - r0: def -> use (disjoint, early) - analyzed LAST -> normal +# - r1: def -> use (not live-out) - normal +# - r0: def -> use (late, feeds live-out) - analyzed FIRST -> RESERVED +# +# Expected behavior: +# - r0's late range (feeding live-out) should be marked RESERVED +# - r0's early disjoint range should be normal (not reserved) +# - r0 should NOT appear in available physical registers (due to reserved range) +# - r1 should appear in available physical registers + +# CHECK-DAG: Live Range #{{[0-9]+}} for r0 [RESERVED]: +# CHECK-DAG: Live Range #{{[0-9]+}} for r0: +# CHECK-DAG: Live Range #{{[0-9]+}} for r1: + +# Verify r0 is NOT in available registers (due to reserved range) +# CHECK: Available Physical Registers for Reallocation: +# CHECK-NEXT: ============================================== +# CHECK-DAG: l2 +# CHECK-DAG: r1 +# CHECK-DAG: r4 +# CHECK-DAG: r5 +# CHECK-DAG: r6 +# CHECK: Total: 5 registers + +--- | + define void @test_reserved_liveout() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label %exit + exit: + ret void + } +... 
+--- +name: test_reserved_liveout +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p6, $r2, $r3 + + $lc = ADD_NC_mv_add_ri $r3, 0 + $ls = MOVXM %bb.1 + $le = MOVXM <mcsymbol .L_LEnd0> + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p6, $r2, $r3 + + ; EARLY in program order (analyzed LAST in backward pass): + ; r0 def -> use chain, disjoint from the later r0 range + ; This is a normal range + $r0 = MOV_alu_mv_mv_mv_scl $p6 + $r5 = ADD_NC_mv_add_rr $r0, $r3 + + ; r1 live range: def -> use (not live-out) + ; This should be a normal, non-reserved range + $r1 = MOV_alu_mv_mv_mv_scl $p6 + $r6 = ADD_NC_mv_add_rr $r1, $r2 + + ; LATE in program order (analyzed FIRST in backward pass): + ; r0 def -> use that feeds live-out to bb.2 + ; This should be marked as RESERVED because r0 is live-out to bb.2 + ; and this is the last def-use of r0 before the block end + $r0 = MOV_alu_mv_mv_mv_scl $p6 + $r4 = ADD_NC_mv_add_rr $r0, $r2 + + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.1 + + bb.2.exit (align 16): + liveins: $r0, $r2 + + ; Use r0 from the loop - this makes r0 live-out from bb.1 + ; Backward analysis starts here, sees r0 is live-in to bb.2 + ; Then traces back and finds the LATE r0 def-use (closest to block end) + ; feeds into this, so that range becomes RESERVED + $r7 = ADD_NC_mv_add_rr $r0, $r2 + + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test9-reserved-composite-subreg-liveout.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test9-reserved-composite-subreg-liveout.mir new file mode 100644 index 000000000000..eb6d4b4bdc3b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/regalloc/test9-reserved-composite-subreg-liveout.mir @@ -0,0 +1,100 @@ +# NOTE: Test for AIERegDefUseTracker - reserved composite ranges with subreg live-out +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 9: Verify reserved range handling for composite registers with subreg live-out +# +# This test exercises reserved ranges with composite base registers: +# 1. A composite def (dm4) with multiple subreg uses (cml4, cmh4) that feeds +# into a subreg live-out (cml4) should create a RESERVED range +# 2. An additional disjoint live range on the same composite register should +# NOT make that register available for reallocation +# +# Program order (backward analysis processes in reverse): +# - dm4: def → use cml4, use cmh4 (disjoint, early) - analyzed LAST → normal +# - dm4: def → use cml4, use cmh4 (late, cml4 feeds live-out) - analyzed FIRST → RESERVED +# +# Expected behavior: +# - dm4's late range (feeding subreg live-out) should be marked RESERVED +# - dm4's early disjoint range should be normal (not reserved) +# - dm4 (and its subregs) should NOT appear in available physical registers + +# CHECK-DAG: Live Range #{{[0-9]+}} for dm4 [RESERVED]: +# CHECK-DAG: Live Range #{{[0-9]+}} for dm4: + +# Verify dm4 and its subregs are NOT in available registers (due to reserved range) +# CHECK: Available Physical Registers for Reallocation: +# CHECK-NEXT: ============================================== +# CHECK-NOT: dm4 +# CHECK-NOT: cml4 +# CHECK-NOT: cmh4 +# CHECK: Total: 0 registers + +--- | + define void @test_reserved_composite_subreg_liveout() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label 
%exit + exit: + ret void + } +... +--- +name: test_reserved_composite_subreg_liveout +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $p1, $y5, $y0, $r2, $r3 + + $lc = ADD_NC_mv_add_ri $r3, 0 + $ls = MOVXM %bb.1 + $le = MOVXM <mcsymbol .L_LEnd0> + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $y5, $y0, $r2 + + ; EARLY in program order (analyzed LAST in backward pass): + ; dm4 composite def → subreg uses (cml4, cmh4), disjoint from later range + ; This is a normal range + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 0, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 64, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + ; LATE in program order (analyzed FIRST in backward pass): + ; dm4 composite def → subreg uses (cml4, cmh4) where cml4 feeds live-out + ; This should be marked as RESERVED because cml4 is live-out to bb.2 + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y5, $y0, $r2, implicit-def dead $srfpflags, implicit $crfpmask + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 128, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cmh4, $p1, 192, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.1 + + bb.2.exit (align 16): + liveins: $cml4, $p0 + + ; Use cml4 (subreg of dm4) from the loop - this makes cml4 live-out from bb.1 + ; Backward analysis starts here, sees cml4 is live-in to bb.2 + ; Then traces back and finds the LATE dm4 def with cml4 use (closest to block end) + ; feeds into this, so that range becomes RESERVED + VST_CONV_bf16_fp32_dmx_sts_srs_bf_idx_imm $cml4, $p0, 256, implicit-def $srf2fflags, implicit $crf2fmask, implicit $crrnd + + RET implicit $lr + DelayedSchedBarrier + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/conv2d_bf16.mir index e4b6d5f4ee46..77328d1f17a2 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/conv2d_bf16.mir +++ b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/conv2d_bf16.mir @@ -64,11 +64,11 @@ body: | ; CHECK-NEXT: $x3, $p0, $lf0, $r24 = VLDA_POP_dmx_lda_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) ; CHECK-NEXT: $x6, $p0, $lf0, $r24 = VLDB_POP_dmx_ldb_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) ; CHECK-NEXT: BUNDLE implicit-def $x10, implicit-def $wl10, implicit-def $wh10, implicit-def $p1, implicit-def $x1, implicit-def $wl1, implicit-def $wh1, implicit-def $p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit killed $p1, implicit killed $p0, implicit killed $lf0, implicit killed $r24, debug-location !6 { - ; CHECK-NEXT: renamable $x10, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) + ; CHECK-NEXT: $x10, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) ; CHECK-NEXT: $x1, $p0, $lf0, $r24 = VLDB_POP_dmx_ldb_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x11, implicit-def $wl11, implicit-def $wh11, implicit-def $p1, implicit-def $p0, implicit-def $dc1, implicit-def $dc5, implicit killed $p1, implicit killed $p0, implicit $d1_3d, debug-location !6 { - ; CHECK-NEXT: renamable $x11, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) + ; CHECK-NEXT: $x11, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed 
renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) ; CHECK-NEXT: $p0, $dc1, $dc5 = PADDS_3D killed $p0, $d1_3d, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x5, implicit-def $wl5, implicit-def $wh5, implicit-def $p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit-def $lfe, implicit-def $srfifo_uf, implicit-def $lc, implicit $r30, implicit killed $p0, implicit killed $lf0, implicit killed $r24, implicit killed $lfe, implicit killed $r5, debug-location !6 { @@ -84,13 +84,13 @@ body: | ; CHECK-NEXT: MOVXM_lng_cg_le_abs , implicit-def $le, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x10, implicit-def $wl10, implicit-def $wh10, implicit-def $p1, implicit-def $x1, implicit-def $wl1, implicit-def $wh1, implicit-def $p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit killed $p1, implicit killed $p0, implicit killed $lf0, implicit killed $r24, debug-location !6 { - ; CHECK-NEXT: renamable $x10, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) + ; CHECK-NEXT: $x10, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) ; CHECK-NEXT: $x1, $p0, $lf0, $r24 = VLDB_POP_dmx_ldb_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x11, implicit-def $wl11, implicit-def $wh11, implicit-def $p1, implicit-def $p0, implicit-def $dc1, implicit-def $dc5, implicit-def $x7, implicit-def $wl7, implicit-def $wh7, implicit killed $p1, implicit killed $p0, implicit $d1_3d, implicit killed $x5, implicit killed $x3, implicit $r18, debug-location !6 { - ; CHECK-NEXT: renamable $x11, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), 
addrspace 6) + ; CHECK-NEXT: $x11, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) ; CHECK-NEXT: $p0, $dc1, $dc5 = PADDS_3D killed $p0, $d1_3d, debug-location !6 - ; CHECK-NEXT: renamable $x7 = VSHUFFLE_vec_shuffle_x killed renamable $x5, killed renamable $x3, renamable $r18, debug-location !6 + ; CHECK-NEXT: $x7 = VSHUFFLE_vec_shuffle_x killed $x5, killed $x3, renamable $r18, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.for.body266.i: @@ -99,27 +99,27 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUNDLE implicit-def $x5, implicit-def $wl5, implicit-def $wh5, implicit-def $p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit-def $lfe, implicit-def $srfifo_uf, implicit-def $x4, implicit-def $wl4, implicit-def $wh4, implicit $r30, implicit killed $p0, implicit killed $lf0, implicit killed $r24, implicit killed $lfe, implicit killed $x3, implicit $r12, debug-location !6 { ; CHECK-NEXT: $x5, $p0, $lf0, $r24 = VLDB_POPX $r30, $r30, killed $p0, killed $lf0, killed $r24, implicit-def $lfe, implicit-def $srfifo_uf, implicit killed $lfe, debug-location !6 :: (load unknown-size, align 1) - ; CHECK-NEXT: renamable $x4 = VSHUFFLE_vec_shuffle_x internal renamable $x5, killed renamable $x3, renamable $r12, debug-location !6 + ; CHECK-NEXT: $x4 = VSHUFFLE_vec_shuffle_x internal $x5, killed $x3, renamable $r12, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x3, implicit-def $wl3, implicit-def $wh3, implicit-def $p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit-def $x8, implicit-def $wl8, implicit-def $wh8, implicit-def $cml3, implicit-def $bmll3, implicit-def $bmlh3, implicit-def dead $srfpflags, implicit killed $p0, implicit killed $lf0, implicit killed $r24, implicit killed $x6, implicit $x1, implicit $r18, implicit killed $cml3, implicit killed $x7, implicit $y5, implicit 
$r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { ; CHECK-NEXT: $x3, $p0, $lf0, $r24 = VLDA_POP_dmx_lda_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) - ; CHECK-NEXT: renamable $x8 = VSHUFFLE_vec_shuffle_x killed renamable $x6, renamable $x1, renamable $r18, debug-location !6 + ; CHECK-NEXT: $x8 = VSHUFFLE_vec_shuffle_x killed $x6, $x1, renamable $r18, debug-location !6 ; CHECK-NEXT: $cml3 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml3, killed $x7, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit-def $cml2, implicit-def $bmll2, implicit-def $bmlh2, implicit-def dead $srfpflags, implicit killed $p0, implicit killed $lf0, implicit killed $r24, implicit killed $x1, implicit $r12, implicit killed $cml2, implicit killed $x4, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { ; CHECK-NEXT: $x6, $p0, $lf0, $r24 = VLDB_POP_dmx_ldb_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) - ; CHECK-NEXT: renamable $x6 = VSHUFFLE_vec_shuffle_x internal renamable $x6, killed renamable $x1, renamable $r12, debug-location !6 + ; CHECK-NEXT: $x6 = VSHUFFLE_vec_shuffle_x internal $x6, killed $x1, renamable $r12, debug-location !6 ; CHECK-NEXT: $cml2 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml2, killed $x4, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x10, implicit-def $wl10, implicit-def $wh10, implicit-def $p1, implicit-def $x1, implicit-def $wl1, implicit-def $wh1, implicit-def 
$p0, implicit-def $lf0, implicit-def $lfl0, implicit-def $lfh0, implicit-def $r24, implicit-def $cml1, implicit-def $bmll1, implicit-def $bmlh1, implicit-def dead $srfpflags, implicit killed $p1, implicit killed $p0, implicit killed $lf0, implicit killed $r24, implicit killed $cml1, implicit killed $x8, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x10, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) + ; CHECK-NEXT: $x10, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) ; CHECK-NEXT: $x1, $p0, $lf0, $r24 = VLDB_POP_dmx_ldb_fifo_x_normal_pop killed $p0, killed $lf0, killed $r24, debug-location !6 :: (load unknown-size, align 1) ; CHECK-NEXT: $cml1 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml1, killed $x8, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x11, implicit-def $wl11, implicit-def $wh11, implicit-def $p1, implicit-def $p0, implicit-def $dc1, implicit-def $dc5, implicit-def $x7, implicit-def $wl7, implicit-def $wh7, implicit-def $cml0, implicit-def $bmll0, implicit-def $bmlh0, implicit-def dead $srfpflags, implicit killed $p1, implicit killed $p0, implicit $d1_3d, implicit killed $x5, implicit killed $x3, implicit $r18, implicit killed $cml0, implicit killed $x6, implicit killed $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x11, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) + ; CHECK-NEXT: $x11, renamable $p1 = VLDA_dmx_lda_x_ld_pstm_nrm_imm killed renamable $p1, 64, debug-location !6 :: (load (<16 x s32>), addrspace 6) ; 
CHECK-NEXT: $p0, $dc1, $dc5 = PADDS_3D killed $p0, $d1_3d, debug-location !6 - ; CHECK-NEXT: renamable $x7 = VSHUFFLE_vec_shuffle_x killed renamable $x5, killed renamable $x3, renamable $r18, debug-location !6 + ; CHECK-NEXT: $x7 = VSHUFFLE_vec_shuffle_x killed $x5, killed $x3, renamable $r18, debug-location !6 ; CHECK-NEXT: $cml0 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml0, killed $x6, killed $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: PseudoLoopEnd , %bb.2, debug-location !6 @@ -127,27 +127,27 @@ body: | ; CHECK-NEXT: bb.3.for.cond.cleanup265.i: ; CHECK-NEXT: liveins: $cml0, $cml1, $cml2, $cml3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x4 = VSHUFFLE_vec_shuffle_x renamable $x5, renamable $x3, renamable $r12, debug-location !6 + ; CHECK-NEXT: $x4 = VSHUFFLE_vec_shuffle_x $x5, $x3, renamable $r12, debug-location !6 ; CHECK-NEXT: BUNDLE implicit-def $x8, implicit-def $wl8, implicit-def $wh8, implicit-def $cml3, implicit-def $bmll3, implicit-def $bmlh3, implicit-def dead $srfpflags, implicit $x6, implicit $x1, implicit $r18, implicit killed $cml3, implicit killed $x7, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x8 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x1, renamable $r18, debug-location !6 + ; CHECK-NEXT: $x8 = VSHUFFLE_vec_shuffle_x $x6, $x1, renamable $r18, debug-location !6 ; CHECK-NEXT: $cml3 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml3, killed $x7, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $cml2, implicit-def $bmll2, implicit-def $bmlh2, implicit-def dead $srfpflags, implicit killed $x6, implicit $x1, implicit $r12, implicit killed $cml2, implicit killed 
$x4, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x6 = VSHUFFLE_vec_shuffle_x killed renamable $x6, renamable $x1, renamable $r12, debug-location !6 + ; CHECK-NEXT: $x6 = VSHUFFLE_vec_shuffle_x killed $x6, $x1, renamable $r12, debug-location !6 ; CHECK-NEXT: $cml2 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml2, killed $x4, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: $cml1 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml1, killed $x8, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: BUNDLE implicit-def $x7, implicit-def $wl7, implicit-def $wh7, implicit-def $cml0, implicit-def $bmll0, implicit-def $bmlh0, implicit-def dead $srfpflags, implicit $x5, implicit $x3, implicit $r18, implicit killed $cml0, implicit $x6, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x7 = VSHUFFLE_vec_shuffle_x renamable $x5, renamable $x3, renamable $r18, debug-location !6 + ; CHECK-NEXT: $x7 = VSHUFFLE_vec_shuffle_x $x5, $x3, renamable $r18, debug-location !6 ; CHECK-NEXT: $cml0 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml0, $x6, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } - ; CHECK-NEXT: renamable $x4 = VSHUFFLE_vec_shuffle_x killed renamable $x5, killed renamable $x3, renamable $r12, debug-location !6 + ; CHECK-NEXT: $x4 = VSHUFFLE_vec_shuffle_x killed $x5, killed $x3, renamable $r12, debug-location !6 ; CHECK-NEXT: BUNDLE implicit-def $x8, implicit-def $wl8, implicit-def $wh8, implicit-def $cml3, implicit-def $bmll3, implicit-def $bmlh3, implicit-def dead $srfpflags, implicit $x6, implicit $x1, implicit killed $r18, 
implicit killed $cml3, implicit killed $x7, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x8 = VSHUFFLE_vec_shuffle_x renamable $x6, renamable $x1, killed renamable $r18, debug-location !6 + ; CHECK-NEXT: $x8 = VSHUFFLE_vec_shuffle_x $x6, $x1, killed renamable $r18, debug-location !6 ; CHECK-NEXT: $cml3 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml3, killed $x7, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $cml2, implicit-def $bmll2, implicit-def $bmlh2, implicit-def dead $srfpflags, implicit killed $x6, implicit killed $x1, implicit killed $r12, implicit killed $cml2, implicit killed $x4, implicit $y5, implicit $r8, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 { - ; CHECK-NEXT: renamable $x6 = VSHUFFLE_vec_shuffle_x killed renamable $x6, killed renamable $x1, killed renamable $r12, debug-location !6 + ; CHECK-NEXT: $x6 = VSHUFFLE_vec_shuffle_x killed $x6, killed $x1, killed renamable $r12, debug-location !6 ; CHECK-NEXT: $cml2 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml2, killed $x4, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 ; CHECK-NEXT: } ; CHECK-NEXT: $cml1 = VMAC_f_vmac_bf_half_vmul_bf_core_X_Y killed $cml1, killed $x8, $y5, $r8, implicit-def dead $srfpflags, implicit $crbf8conf, implicit $crfp8conf, implicit $crfpmask, debug-location !6 diff --git a/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/regalloc/test14-read-modify-write.mir b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/regalloc/test14-read-modify-write.mir new file mode 100644 index 000000000000..e888fe301487 --- /dev/null +++ 
b/llvm/test/CodeGen/AIE/aie2ps/schedule/postpipeliner/regalloc/test14-read-modify-write.mir @@ -0,0 +1,96 @@ +# NOTE: Test for AIERegDefUseTracker - composite register pattern with AIE2PS VMUL +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -verify-machineinstrs --mtriple=aie2ps -O2 \ +# RUN: --start-before=postmisched %s \ +# RUN: --aie-postpipeliner-vreg-mode=1 \ +# RUN: --aie-postpipeliner-phys-mode=0 \ +# RUN: --debug-only=aie-reg-liverange \ +# RUN: --aie-test-regdefuse-tracker --aie-postpipeliner-filter-no-choice=false \ +# RUN: -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test 14: Composite register pattern with VMUL using Y registers +# +# Pattern: +# x10 = VLDB_UNPACK... ; first def of x10 +# x0 = (some op)... ; local def of x0 +# x10 = VMAX... ; second def of x10 (read-modify-write) +# dm1 = VMUL y0, y5, r9 ; uses y0 (x0 + x1) and y5 (x10 + x11) +# +# y0 is composed of: locally-defined x0 + live-in x1 +# y5 is composed of: locally-defined x10 + undefined x11 (not live-in) +# +# The goal is to have two separate live ranges for x10: +# - Live Range 1: VLDB_UNPACK def -> VMIN use (base register: x10) +# - Live Range 2: VMAX def -> VMUL use via y5 (base register: y5) +# +# This tests that lane-mask-based overlap checking correctly separates +# live ranges when a subreg is redefined within a super-register range. 
+# +# CHECK: FINAL LIVE RANGES +# CHECK: Live Range #7 for x10: +# CHECK-NEXT: Definitions (1): +# CHECK-NEXT: [0] Register: x10 $x10, $p0 = VLDB_UNPACK_dmw_ldb_unpack_pstm_nrm_imm_unpackSign1 +# CHECK-NEXT: Uses (1): +# CHECK-NEXT: [0] Register: x10 $x0, dead $r16 = VMIN_GE_16_vaddSign1 $x10, $x4 +# CHECK-EMPTY: +# CHECK-NEXT: Live Range #2 for y5: +# CHECK-NEXT: Definitions (1): +# CHECK-NEXT: [0] Register: x10 (SubRegIdx: 8) $x10, dead $r16 = VMAX_LT_16_vaddSign1 $x0, $x4 +# CHECK-NEXT: Uses (1): +# CHECK-NEXT: [0] Register: y5 $dm1 = VMUL_vmul_vmul_cm_core_Y_Y $y0, $y5, $r9 + +--- | + define void @test_vmul_composite_regs() { + entry: + br label %loop + loop: + br i1 undef, label %loop, label %exit + exit: + ret void + } +... +--- +name: test_vmul_composite_regs +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $p0, $r1, $r9, $x1, $x4 + + $lc = ADD_NC_add_lc_ri $r1, 0 + MOVXM_lng_cg_ls_abs %bb.1, implicit-def $ls + MOVXM_lng_cg_le_abs <mcsymbol .L_LEnd0>, implicit-def $le + + bb.1.loop (align 16): + successors: %bb.1, %bb.2 + liveins: $p0, $r9, $x1, $x4 + + ; First def of x10 (from load with unpack) + $x10, $p0 = VLDB_UNPACK_dmw_ldb_unpack_pstm_nrm_imm_unpackSign1 $p0, 32, implicit $crunpacksize, implicit $unpacksign1 + + ; Local def of x0 (via min operation using x10 and x4) + $x0, dead $r16 = VMIN_GE_16_vaddSign1 $x10, $x4, implicit $crbf8conf, implicit $crfp8conf, implicit $vaddsign1 + + ; Second def of x10 (read-modify-write: uses x0, defines x10) + $x10, dead $r16 = VMAX_LT_16_vaddSign1 $x0, $x4, implicit $crbf8conf, implicit $crfp8conf, implicit $vaddsign1 + + ; VMUL uses y0 (x0 + x1) and y5 (x10 + x11) + ; y0 = locally-defined x0 + live-in x1 + ; y5 = locally-defined x10 + undefined x11 + $dm1 = VMUL_vmul_vmul_cm_core_Y_Y $y0, $y5, $r9 + + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.1 + + bb.2.exit (align 16): + + RET implicit $lr + DelayedSchedBarrier + +...