diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index bf08fdbf2ff4..1cbf15e7065f 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -269,8 +269,6 @@ class BiasDepth : public ScheduleDAGMutation { }; class RegionEndEdges : public ScheduleDAGMutation { - AAResults *AA; - void removeExitSUPreds(ScheduleDAGInstrs *DAG) { SUnit &ExitSU = DAG->ExitSU; while (!ExitSU.Preds.empty()) { @@ -278,7 +276,7 @@ class RegionEndEdges : public ScheduleDAGMutation { } } void apply(ScheduleDAGInstrs *DAG) override { - AIE::MaxLatencyFinder MaxLatency(DAG, AA); + AIE::MaxLatencyFinder MaxLatency(DAG); MachineBasicBlock *PrologueMBB = DAG->getBB(); unsigned int ZOLBundlesCount = 0; @@ -349,7 +347,7 @@ class RegionEndEdges : public ScheduleDAGMutation { }; public: - RegionEndEdges(AAResults *AA = nullptr) : AA(AA) {} + RegionEndEdges() {} }; /// This Mutator is responsible for emitting "fixed" SUnits at the top or bottom @@ -912,16 +910,30 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT, AAResults *AA) { if (!TT.isAIE1()) { if (EnableWAWStickyRegisters) Mutations.emplace_back(std::make_unique()); - Mutations.emplace_back(std::make_unique(AA)); + // RegionEndEdges must run before MemoryEdges/WAWEdges/BiasDepth, and + // EmitFixedSUnits must run last. Both are applied via applyMutations() + // inside AIEPostRASchedStrategy::buildGraph, which also suppresses the + // redundant postProcessDAG() call from ScheduleDAGMI::schedule(). + Mutations.emplace_back(createRegionEndEdgesMutation()); Mutations.emplace_back(std::make_unique(true)); Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); - Mutations.emplace_back(std::make_unique( - EnableAAInEmitFixedSUnits ? 
AA : nullptr)); + Mutations.emplace_back(createEmitFixedSUnitsMutation(AA)); } return Mutations; } +std::unique_ptr +AIEBaseSubtarget::createRegionEndEdgesMutation() { + return std::make_unique(); +} + +std::unique_ptr +AIEBaseSubtarget::createEmitFixedSUnitsMutation(AAResults *AA) { + return std::make_unique(EnableAAInEmitFixedSUnits ? AA + : nullptr); +} + // List the Mutations that apply to the interblock DAG construction. std::vector> AIEBaseSubtarget::getDDGMutationsImpl(const Triple &TT, bool ExactLatencies) { diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h index 9eaa4b29d920..9e0146d50d92 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h @@ -56,8 +56,12 @@ class AIEBaseSubtarget : public TargetSubtargetInfo { } void getPostRAMutations(std::vector> &Mutations) const override { - Mutations = - AIEBaseSubtarget::getPostRAMutationsImpl(getTargetTriple(), nullptr); + // Post-RA mutations are applied directly in + // AIEPostRASchedStrategy::buildGraph, which owns the full graph + // construction pipeline. The registered Mutations list is intentionally + // empty so that the postProcessDAG() call in ScheduleDAGMI::schedule() + // is a no-op. + Mutations.clear(); } void overrideSchedPolicy(MachineSchedPolicy &Policy, @@ -84,6 +88,17 @@ class AIEBaseSubtarget : public TargetSubtargetInfo { static std::vector> getSMSMutationsImpl(const Triple &TT); + /// Create the RegionEndEdges mutation for use in buildGraph, where it is + /// invoked directly after the other post-RA mutations and before + /// createEmitFixedSUnitsMutation (ordering is significant). + static std::unique_ptr createRegionEndEdgesMutation(); + + /// Create the EmitFixedSUnits mutation for use in buildGraph, invoked after + /// createRegionEndEdgesMutation to preserve the ExitSU-edge ordering + /// invariant. 
+ static std::unique_ptr + createEmitFixedSUnitsMutation(AAResults *AA); + /// Whether to enable the pre-RA MachinePipeliner. This can be disabled to let /// the post-RA pipeliner handle the scheduling. bool enableMachinePipeliner() const override; diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index a8cb2cd06ef3..2b1b64d08f38 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -378,13 +378,11 @@ void AIEBasePassConfig::addPreSched2() { ScheduleDAGInstrs * AIEBaseTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { - ScheduleDAGMI *DAG = - new AIEScheduleDAGMI(C, std::make_unique(C), - /* RemoveKillFlags=*/true); - for (auto &Mutation : - AIEBaseSubtarget::getPostRAMutationsImpl(getTargetTriple(), C->AA)) - DAG->addMutation(std::move(Mutation)); - return DAG; + // Post-RA mutations are applied directly in + // AIEPostRASchedStrategy::buildGraph, so the registered Mutations list is + // intentionally empty (matching the empty list from getPostRAMutations). + return new AIEScheduleDAGMI(C, std::make_unique(C), + /* RemoveKillFlags=*/true); } ScheduleDAGInstrs * diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp index 08116d367768..80b79cba36c7 100644 --- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp +++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates // //===----------------------------------------------------------------------===// @@ -72,4 +72,67 @@ void DataDependenceHelper::dumpDot(raw_ostream &OS, OS << "}\n"; } +void InterBlockEdges::addNode(MachineInstr *MI) { + if (auto Index = initSUnit(*MI)) { + IndexMap &TheMap = Boundary ? SuccMap : PredMap; + TheMap.emplace(MI, *Index); + } +} + +void InterBlockEdges::markBoundary() { Boundary = SUnits.size(); } + +bool InterBlockEdges::mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) { + if (SafeToIgnoreMemDeps && Boundary) { + // Suppress memory edges that cross the pre/post boundary. + const bool AIsPost = SUa->NodeNum >= *Boundary; + const bool BIsPost = SUb->NodeNum >= *Boundary; + if (AIsPost != BIsPost) + return false; + } + return DataDependenceHelper::mayAlias(SUa, SUb, TBAA); +} + +const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const { + const auto Found = PredMap.find(MI); + if (Found == PredMap.end()) { + return nullptr; + } + return &SUnits.at(Found->second); +} + +bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const { + return Boundary ? SU->NodeNum >= *Boundary : false; +} + +void InterBlockEdges::recordPostDepth(MachineInstr *MI, int Depth) { + const auto Found = SuccMap.find(MI); + if (Found == SuccMap.end()) + return; + PostDepths[Found->second] = Depth; +} + +int InterBlockEdges::getPostDepthOr(const SUnit *SU, int Default) const { + const auto It = PostDepths.find(SU->NodeNum); + return It != PostDepths.end() ? 
It->second : Default; +} + +void InterBlockEdges::recordPreHeightsFromSuccessors() { + for (const auto &[MI, NodeNum] : PredMap) { + const SUnit &SU = SUnits.at(NodeNum); + int MinHeight = std::numeric_limits::max(); + for (const SDep &Dep : SU.Succs) { + if (!isPostBoundaryNode(Dep.getSUnit())) + continue; + MinHeight = std::min(MinHeight, int(Dep.getSUnit()->getHeight())); + } + if (MinHeight != std::numeric_limits::max()) + PreHeights[NodeNum] = MinHeight; + } +} + +int InterBlockEdges::getPreHeight(const SUnit *SU) const { + const auto It = PreHeights.find(SU->NodeNum); + return It != PreHeights.end() ? It->second : std::numeric_limits::max(); +} + } // end namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h index 94ad326ef07f..dcace95370ad 100644 --- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h +++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // @@ -17,6 +17,9 @@ #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include +#include +#include namespace llvm { @@ -36,6 +39,8 @@ class DataDependenceHelper : public ScheduleDAGInstrs { std::vector> Mutations; const MachineSchedContext &Context; void schedule() override {}; + +protected: bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override; public: @@ -53,6 +58,111 @@ class DataDependenceHelper : public ScheduleDAGInstrs { // are printed. void dumpDot(raw_ostream &OS, bool IncludeBoundaries) const; }; + +/// This class generates all edges between nodes in two flow-adjacent regions. 
+/// The nodes are added in forward flow order, marking the boundary at the +/// appropriate point. +/// +/// When SafeToIgnoreMemDeps is set, memory-alias edges that cross the +/// pre/post boundary are suppressed via a mayAlias() override. +/// +/// The class also provides optional depth and height maps (both keyed by SUnit +/// NodeNum, so they remain unambiguous when the same MachineInstr* appears on +/// both sides of the boundary, e.g. in a single-block loop): +/// +/// PostDepths — top-down cycle of each post-boundary node. Populated by +/// recordPostDepth(); queried by getPostDepthOr(). +/// +/// PreHeights — for each pre-boundary node, the minimum getHeight() of its +/// post-boundary successors in the DDG. Populated by +/// recordPreHeightsFromSuccessors() after buildEdges(); queried by +/// getPreHeight(). +/// +/// PreRegionLength — total number of bundles in the pre-boundary region. +/// +/// PostRegionLength — total number of bundles in the post-boundary region, +/// used to represent the depth of the artificial ExitSU node. +class InterBlockEdges : public DataDependenceHelper { + // The boundary between Pred and Succ nodes. + std::optional Boundary; + // When true, memory edges crossing the boundary are suppressed. + bool SafeToIgnoreMemDeps = false; + + /// We can add the same instruction on both sides of the boundary. + /// We maintain explicit maps to retrieve the corresponding SUnit. + using IndexMap = std::map; + IndexMap PredMap; + IndexMap SuccMap; + + /// Depth (top-down cycle) of post-boundary SUnits, keyed by NodeNum. + std::map PostDepths; + /// For each pre-boundary SUnit, the minimum getHeight() of its + /// post-boundary successors (keyed by NodeNum). + std::map PreHeights; + /// Total number of bundles in the pre-boundary region. + int PreRegionLength = 0; + /// Total number of bundles in the post-boundary region. 
+ int PostRegionLength = 0; + + bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override; + +public: + InterBlockEdges(const MachineSchedContext &Context, + bool SafeToIgnoreMemDeps = false) + : DataDependenceHelper(Context, true, true), + SafeToIgnoreMemDeps(SafeToIgnoreMemDeps) {} + + /// Add a Node to the DAG. + void addNode(MachineInstr *); + + /// Mark the boundary between the predecessor block and the successor block. + /// In normal operation, there should just be one call to this method. + /// Nodes added before are part of the predecessor, nodes added after are + /// part of the successor. + void markBoundary(); + + /// To iterate forward across the SUnits of the underlying DDG. + auto begin() const { return SUnits.begin(); } + auto end() const { return SUnits.end(); } + + /// The following two methods are used to find the cross-boundary edges, + /// by starting from a pre-boundary node and selecting its successor edges + /// that connect to a post-boundary node. + /// --- + /// Retrieve the SUnit that represents MI's instance before the + /// boundary, null if not found. + const SUnit *getPreBoundaryNode(MachineInstr *MI) const; + + /// Check whether SU represents an instruction after the boundary. + bool isPostBoundaryNode(SUnit *SU) const; + + // Post-boundary depth interface. + /// Record the top-down cycle of a post-boundary instruction. + void recordPostDepth(MachineInstr *MI, int Depth); + /// Get the recorded top-down cycle of a post-boundary SUnit, or \p Default + /// if no depth has been recorded (e.g. the instruction is beyond the + /// conflict horizon). + int getPostDepthOr(const SUnit *SU, int Default) const; + /// Clear all recorded post-boundary depths. Call before repopulating. + void clearPostDepths() { PostDepths.clear(); } + + // Pre-boundary height interface. + /// Compute and store, for each pre-boundary SUnit, the minimum getHeight() + /// of its post-boundary successors. Must be called after buildEdges(). 
+ void recordPreHeightsFromSuccessors(); + /// Get the stored height of a pre-boundary SUnit. + /// Returns INT_MAX if not recorded (conservative: no loop-carried use). + int getPreHeight(const SUnit *SU) const; + + // Pre-boundary region length. + void setPreRegionLength(int Length) { PreRegionLength = Length; } + int getPreRegionLength() const { return PreRegionLength; } + + // Post-boundary region length (used as depth of the ExitSU node). + void setPostRegionLength(int Length) { PostRegionLength = Length; } + int getPostRegionLength() const { return PostRegionLength; } +}; + } // namespace AIE } // namespace llvm diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 61ad93cc7711..814405a06e2d 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -383,6 +383,13 @@ void InterBlockScheduling::enterBlock(MachineBasicBlock *BB) { << CurrentBlockState->kindAsString() << " FixPointIter=" << CurrentBlockState->FixPoint.NumIters << " II=" << CurrentBlockState->FixPoint.II << "\n"); + // Emit SWP prologues/epilogues that belong to this block. This only applies + // in the Scheduling stage: during GatheringRegions the regions are only being + // recorded without physically inserting any SWP code yet. 
+ if (CurrentBlockState->FixPoint.Stage != SchedulingStage::GatheringRegions) { + emitInterBlockTop(*CurrentBlockState); + emitInterBlockBottom(*CurrentBlockState); + } } namespace { /// This implements the interface to the postpipeliner to extract the @@ -529,7 +536,7 @@ InterBlockScheduling::resourcesConverged(BlockState &BS, return nullptr; } -MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const { +MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) { const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget(); auto *TII = static_cast(SubTarget.getInstrInfo()); auto *ItinData = SubTarget.getInstrItineraryData(); @@ -543,17 +550,18 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const { // If the successor is in Top, we lookup its depth in TopDepth const Region &Bottom = BS.getBottom(); const Region &Top = BS.getTop(); - const InterBlockEdges &BackEdges = BS.getBoundaryEdges(); + InterBlockEdges &BackEdges = BS.getBoundaryEdges(); - // Record the depth of all instructions in Top. Don't record the ones that - // can't cause problems - std::map TopDepth; - int Depth = 0; + // Repopulate the post-boundary depths from the current scheduled bundles of + // the top region, capped at the conflict horizon. Clear first so that stale + // values from a previous fixpoint iteration are not retained. 
+ BackEdges.clearPostDepths(); + int TopDepth = 0; for (auto &Bundle : Top.Bundles) { for (auto *MI : Bundle.getInstrs()) { - TopDepth[MI] = Depth; + BackEdges.recordPostDepth(MI, TopDepth); } - if (++Depth > HR->getConflictHorizon()) { + if (++TopDepth > HR->getConflictHorizon()) { break; } } @@ -579,14 +587,14 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const { continue; } DEBUG_LOOPAWARE(dbgs() << " Backedge to " << Succ->NodeNum << "\n"); - auto DepthIt = TopDepth.find(Succ->getInstr()); - if (DepthIt == TopDepth.end()) { - // Over the horizon - continue; - } - DEBUG_LOOPAWARE(dbgs() << " Depth=" << DepthIt->second << "\n"); + // Instructions beyond the conflict horizon default to ConflictHorizon, + // so that Distance = Height + ConflictHorizon >= 1 + ConflictHorizon, + // which is always >= Latency, naturally avoiding false positives. + const int SuccDepth = + BackEdges.getPostDepthOr(Succ, HR->getConflictHorizon()); + DEBUG_LOOPAWARE(dbgs() << " Depth=" << SuccDepth << "\n"); int Latency = SDep.getSignedLatency(); - int Distance = Height + DepthIt->second; + int Distance = Height + SuccDepth; if (Distance < Latency) { DEBUG_LOOPAWARE(dbgs() << " Latency(" << Pred->NodeNum << "->" << Succ->NodeNum << ")=" << Latency @@ -610,19 +618,22 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const { } SchedulingStage InterBlockScheduling::updateFixPoint(BlockState &BS) { - if (BS.Kind != BlockType::Loop) { - return SchedulingStage::SchedulingDone; - } - if (BS.FixPoint.Stage == SchedulingStage::GatheringRegions) { - // This is the first time we schedule this loop. In that first - // iteration, we have recorded the region decomposition. - // Now we can create the interblock edges between the top and the bottom - // region - BS.initInterBlock(*Context, *HR); + // This is the first visit to this block. The region decomposition has been + // gathered. 
Now transition to Scheduling so the next pass actually + // schedules the gathered regions. + if (BS.Kind == BlockType::Loop) { + // For loops, also create the interblock edges between the top and the + // bottom region. + BS.initInterBlock(*Context, *HR); + } return SchedulingStage::Scheduling; } + if (BS.Kind != BlockType::Loop) { + return SchedulingStage::SchedulingDone; + } + BS.FixPoint.NumIters++; if (BS.FixPoint.Stage == SchedulingStage::Scheduling) { return updateScheduling(BS); @@ -839,16 +850,33 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB, DEBUG_BLOCKS(dbgs() << " >> enterRegion, Iter=" << BS.FixPoint.NumIters << "\n"); - // Only add regions of loops when in the GatheringRegions phase - if (BS.Kind != BlockType::Loop || - BS.FixPoint.Stage == SchedulingStage::GatheringRegions) { - ArrayRef TopFixedBundles = - RegionBegin == BB->begin() ? ArrayRef(BS.TopInsert) - : ArrayRef(); - ArrayRef BotFixedBundles = - RegionEnd == BB->end() ? ArrayRef(BS.BottomInsert) - : ArrayRef(); - BS.addRegion(BB, RegionBegin, RegionEnd, TopFixedBundles, BotFixedBundles); + if (BS.FixPoint.Stage == SchedulingStage::GatheringRegions) { + // Gather region boundaries and capture the invariant SemanticOrder for all + // block types. Fixed bundles are NOT set here: they result from loop + // pipelining, which happens during Scheduling, and are applied via the + // setTopFixedBundles / setBotFixedBundles calls in the Scheduling pass. + BS.addRegion(BB, RegionBegin, RegionEnd); + } else if (BS.Kind != BlockType::Loop) { + // Scheduling pass for non-loop blocks: set fixed bundles on the + // pre-gathered region now that emitInterBlockTop / emitInterBlockBottom + // has physically inserted the SWP instructions into the block. + // + // If Regions is empty, the block was empty during GatheringRegions (e.g. + // a newly-created dedicated exit block). The machine scheduler skips + // enterRegion for empty blocks so no region was captured. 
Create it now + // with correct free-instruction boundaries, excluding any fixed bundles. + if (BS.getRegions().empty()) { + const unsigned TopCount = + (RegionBegin == BB->begin()) ? BS.TopInsert.size() : 0u; + const unsigned BotCount = + (RegionEnd == BB->end()) ? BS.BottomInsert.size() : 0u; + BS.addRegion(BB, std::next(RegionBegin, TopCount), + std::prev(RegionEnd, BotCount)); + } + if (RegionBegin == BB->begin() && !BS.TopInsert.empty()) + BS.getCurrentRegion().setTopFixedBundles(BS.TopInsert); + if (RegionEnd == BB->end() && !BS.BottomInsert.empty()) + BS.getCurrentRegion().setBotFixedBundles(BS.BottomInsert); } } @@ -1042,26 +1070,34 @@ int InterBlockScheduling::getCyclesToRespectTiming( int DistFromLoopEntry = 0; int EntryNops = 0; - auto AddRegionToEdges = [&](const Region &R) { - for (auto &Bundle : R.Bundles) { - for (MachineInstr *MI : Bundle.getInstrs()) { - DistancesFromLoopEntry[MI] = DistFromLoopEntry; - } - ++DistFromLoopEntry; - } + auto AddRegionToEdges = [&](const Region &R, bool IsPostBoundary = false) { + // Add nodes first so that SuccMap/PredMap are populated before depth + // recording (recordPostDepth looks up SuccMap by MachineInstr*). // Here we need to iterate using semantic order. 
assert(R.top_fixed_instrs().empty() && "SWP epilogue already emitted?"); for (MachineInstr *MI : R.getFreeInstructions()) { Edges.addNode(MI); } + for (auto &Bundle : R.Bundles) { + for (MachineInstr *MI : Bundle.getInstrs()) { + if (IsPostBoundary) { + Edges.recordPostDepth(MI, DistFromLoopEntry); + } else { + DistancesFromLoopEntry[MI] = DistFromLoopEntry; + } + } + ++DistFromLoopEntry; + } }; // Construction of the superblock containing Loop+Epilogue // First part is the loop AddRegionToEdges(LoopBS.getBottom()); + Edges.setPreRegionLength(DistFromLoopEntry); Edges.markBoundary(); // Second part is the epilogue itself - AddRegionToEdges(EpilogueBS.getTop()); + AddRegionToEdges(EpilogueBS.getTop(), /*IsPostBoundary=*/true); + Edges.setPostRegionLength(DistFromLoopEntry - Edges.getPreRegionLength()); Edges.buildEdges(); DEBUG_LOOPAWARE(dumpInterBlock(Edges)); @@ -1081,11 +1117,12 @@ int InterBlockScheduling::getCyclesToRespectTiming( const int PostBoundOrExitDist = (PostBoundaryMI != nullptr) - ? DistancesFromLoopEntry[PostBoundaryMI] - // When getInstr returns nullptr, we reached - // ExitSU. We can consider the DistFromLoopEntry as - // depth of the ExitSU. - : DistFromLoopEntry; + ? Edges.getPostDepthOr(Succ, 0) + // When getInstr returns nullptr, we reached ExitSU. + // The coordinate system counts from the start of the loop + // (same as DistancesFromLoopEntry), so ExitSU is at + // pre-region + post-region bundles. + : Edges.getPreRegionLength() + Edges.getPostRegionLength(); const int Latency = SDep.getSignedLatency(); const int Distance = @@ -1150,56 +1187,14 @@ int InterBlockScheduling::getCyclesToAvoidResourceConflicts( return NopCounter; } -void InterBlockEdges::addNode(MachineInstr *MI) { - if (auto Index = DDG.initSUnit(*MI)) { - IndexMap &TheMap = Boundary ? 
SuccMap : PredMap; - TheMap.emplace(MI, *Index); - } -} - -// Mark the boundary between the predecessor block and the successor block -void InterBlockEdges::markBoundary() { Boundary = DDG.SUnits.size(); } - -const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const { - auto Found = PredMap.find(MI); - if (Found == PredMap.end()) { - return nullptr; - } - - return &DDG.SUnits.at(Found->second); -} - -bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const { - return Boundary ? SU->NodeNum >= *Boundary : false; -} - Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - ArrayRef TopFixedBundles, - ArrayRef BotFixedBundles) - : BB(BB), TopFixedBundles(TopFixedBundles), - BotFixedBundles(BotFixedBundles) { - MachineBasicBlock::iterator FreeBegin = - std::next(Begin, TopFixedBundles.size()); - MachineBasicBlock::iterator FreeEnd = std::prev(End, BotFixedBundles.size()); - - // Verify that all fixed instructions are at the right place in the MBB - assert(TopFixedBundles.empty() || Begin == BB->begin()); - assert(TopFixedBundles.empty() || - all_of(TopFixedBundles.back().Instrs, [FreeBegin]( - const MachineInstr *MI) { - return getBundleStart(MI->getIterator()) == std::prev(FreeBegin); - })); - assert(BotFixedBundles.empty() || End == BB->end()); - assert( - BotFixedBundles.empty() || - all_of(BotFixedBundles.front().Instrs, [FreeEnd](const MachineInstr *MI) { - return getBundleStart(MI->getIterator()) == FreeEnd; - })); - + MachineBasicBlock::iterator End) + : BB(BB) { // When the region is created, its instructions haven't been re-ordered yet, - // so this is effectively saving the semantic order. - for (auto It = FreeBegin; It != FreeEnd; ++It) { + // so this is effectively saving the semantic order. Fixed bundles (if any) + // are set separately via setTopFixedBundles / setBotFixedBundles, which + // do not modify SemanticOrder. 
+ for (auto It = Begin; It != End; ++It) { SemanticOrder.push_back(&*It); } if (End != BB->end()) { @@ -1207,6 +1202,32 @@ Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, } } +void Region::setTopFixedBundles(ArrayRef Bundles) { + assert(TopFixedBundles.empty() && "TopFixedBundles already set."); + // Verify the fixed instructions are physically at the top of the block. + const auto FreeBegin = std::next(BB->begin(), Bundles.size()); + assert(all_of(Bundles.back().Instrs, [FreeBegin](const MachineInstr *MI) { + return getBundleStart(MI->getIterator()) == std::prev(FreeBegin); + })); + TopFixedBundles = Bundles; + // SemanticOrder was captured during GatheringRegions before the fixed + // bundles were inserted into the block, so it already contains only the + // free instructions. No adjustment is needed. +} + +void Region::setBotFixedBundles(ArrayRef Bundles) { + assert(BotFixedBundles.empty() && "BotFixedBundles already set."); + // Verify the fixed instructions are physically at the bottom of the block. + const auto FreeEnd = std::prev(BB->end(), Bundles.size()); + assert(all_of(Bundles.front().Instrs, [FreeEnd](const MachineInstr *MI) { + return getBundleStart(MI->getIterator()) == FreeEnd; + })); + BotFixedBundles = Bundles; + // SemanticOrder was captured during GatheringRegions before the fixed + // bundles were inserted into the block, so it already contains only the + // free instructions. No adjustment is needed. 
+} + BlockState::BlockState(MachineBasicBlock *Block) : TheBlock(Block) { classify(); setBlockProperties(); @@ -1267,7 +1288,6 @@ void BlockState::classify() { if (LoopAware && IsLoop(TheBlock) && llvm::all_of(TheBlock->successors(), CanFixLoopSchedule)) { Kind = BlockType::Loop; - FixPoint.Stage = SchedulingStage::GatheringRegions; } // We will mark the epilogues in a second sweep, when all states have been @@ -1325,6 +1345,7 @@ void BlockState::initInterBlock(const MachineSchedContext &Context, BoundaryEdges->addNode(MI); } BoundaryEdges->buildEdges(); + BoundaryEdges->recordPreHeightsFromSuccessors(); DEBUG_LOOPAWARE(dumpInterBlock(*BoundaryEdges)); } diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index bbd3e75770f6..f8260989ed78 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -31,53 +31,6 @@ namespace llvm::AIE { -/// This class generates all edges between nodes in two flow-adjacent regions -/// The nodes are added in forward flow order, marking the boundary at the -/// appropriate point. -class InterBlockEdges { - DataDependenceHelper DDG; - // the boundary between Pred and Succ nodes - std::optional Boundary; - - /// We can add the same instruction on both sides of the boundary. - /// We maintain explicit maps to retrieve the corresponding SUnit - using IndexMap = std::map; - IndexMap PredMap; - IndexMap SuccMap; - -public: - InterBlockEdges(const MachineSchedContext &Context) - : DDG(Context, true, true) {} - - /// Add a Node to the DAG. - void addNode(MachineInstr *); - - /// Mark the boundary between the predecessor block and the successor block. - /// In normal operation, there should just be one call to this method. 
- /// Nodes added before are part of the predecesor, nodes added after are - /// part of the successor - void markBoundary(); - - /// Create all the edges by interpreting read and write events of the nodes - // in reverse order. - void buildEdges() { DDG.buildEdges(); } - - /// To iterate forward across the SUnits of the underlying DDG. - auto begin() const { return DDG.SUnits.begin(); } - auto end() const { return DDG.SUnits.end(); } - - /// The following two methods are used to find the cross-boundary edges, - /// by starting from a pre-boundary node and select its successor edges that - /// connect to a post-boundary node. - /// --- - /// Retrieve the SUnit that represents MI's instance before the - /// boundary, null if not found. - const SUnit *getPreBoundaryNode(MachineInstr *MI) const; - - /// Check whether SU represents an instruction after the boundary - bool isPostBoundaryNode(SUnit *SU) const; -}; - // BlockType determines scheduling priority, direction and safety margin // handling. enum class BlockType { Regular, Loop, Epilogue }; @@ -113,7 +66,7 @@ enum class SchedulingStage { /// Parameters that drive fixpoint convergence class FixedpointState { public: - SchedulingStage Stage = SchedulingStage::Scheduling; + SchedulingStage Stage = SchedulingStage::GatheringRegions; // Parameters of the loop-aware convergence int LatencyMargin = 0; SmallMapVector PerMILatencyMargin; @@ -153,9 +106,7 @@ class Region { public: Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - ArrayRef TopFixedBundles, - ArrayRef BotFixedBundles); + MachineBasicBlock::iterator End); using free_iterator = std::vector::const_iterator; using fixed_iterator = MachineBasicBlock::iterator; @@ -181,6 +132,18 @@ class Region { } ArrayRef getBotFixedBundles() const { return BotFixedBundles; } + /// Set the fixed bundles at the top of the region (e.g. a SWP epilogue). 
+ /// The instructions must already be physically present at the start of the + /// block. Leaves SemanticOrder unchanged. + /// \pre The region starts at BB->begin(). + void setTopFixedBundles(ArrayRef Bundles); + + /// Set the fixed bundles at the bottom of the region (e.g. a SWP prologue). + /// The instructions must already be physically present at the end of the + /// block. Leaves SemanticOrder unchanged. + /// \pre The region ends at BB->end(). + void setBotFixedBundles(ArrayRef Bundles); + MachineInstr *getExitInstr() const { return ExitInstr; } std::vector Bundles; @@ -240,15 +203,11 @@ class BlockState { TheBundles.insert(TheBundles.end(), Bundles.begin(), Bundles.end()); } void addRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator RegionBegin, - MachineBasicBlock::iterator RegionEnd, - ArrayRef TopFixedBundles, - ArrayRef BotFixedBundles) { - assert((Kind == BlockType::Loop && - FixPoint.Stage == SchedulingStage::GatheringRegions) || - FixPoint.Stage == SchedulingStage::Scheduling); + MachineBasicBlock::iterator RegionEnd) { + assert(FixPoint.Stage == SchedulingStage::GatheringRegions || + (FixPoint.Stage == SchedulingStage::Scheduling && Regions.empty())); CurrentRegion = Regions.size(); - Regions.emplace_back(BB, RegionBegin, RegionEnd, TopFixedBundles, - BotFixedBundles); + Regions.emplace_back(BB, RegionBegin, RegionEnd); } auto &getCurrentRegion() const { return Regions.at(CurrentRegion); } auto &getCurrentRegion() { return Regions[CurrentRegion]; } @@ -256,6 +215,10 @@ class BlockState { const Region &getTop() const { return Regions.back(); } Region &getTop() { return Regions.back(); } const Region &getBottom() const { return Regions.front(); } + InterBlockEdges &getBoundaryEdges() { + assert(Kind == BlockType::Loop && BoundaryEdges); + return *BoundaryEdges; + } const InterBlockEdges &getBoundaryEdges() const { assert(Kind == BlockType::Loop && BoundaryEdges); return *BoundaryEdges; 
@@ -344,7 +307,7 @@ class InterBlockScheduling { /// Return one instruction that needs a higher latency cap, or nullptr if all /// latencies converged. - MachineInstr *latencyConverged(BlockState &BS) const; + MachineInstr *latencyConverged(BlockState &BS); /// After finding the loops, determine the epilogue blocks. void markEpilogueBlocks(); @@ -437,6 +400,8 @@ class InterBlockScheduling { AIEAlternateDescriptors &getSelectedAltDescs() { return SelectedAltDescs; } + const MachineSchedContext *getContext() const { return Context; } + std::optional getSWPEpilogueContext(MachineBasicBlock *MBB); }; diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 0aef0e14b0cd..e144eecb29c6 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -11,6 +11,7 @@ #include "AIEMachineScheduler.h" #include "AIEBaseAliasAnalysis.h" #include "AIEBaseInstrInfo.h" +#include "AIEBaseSubtarget.h" #include "AIEBundle.h" #include "AIEHazardRecognizer.h" #include "AIEInterBlockScheduling.h" @@ -783,13 +784,8 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { CurMBB = MBB; // We force bottom up region processing, so the first region // from a block is the bottom one. We reset this when leaving any - // region + // region. IsBottomRegion = true; - - // The block may have a timed region, append its instructions. - auto &BS = InterBlock.getBlockState(MBB); - InterBlock.emitInterBlockTop(BS); - InterBlock.emitInterBlockBottom(BS); } static MachineBasicBlock::iterator @@ -1164,17 +1160,7 @@ int getEarliestLoopCarriedUse(const SUnit &SU, assert(SUInCurrentIteration); assert(SUInCurrentIteration->getHeight() >= SU.getHeight()); - // Look at loop-carried dependencies to see how early the instruction will be - // needed in the next iteration. 
- int EarliestCycle = std::numeric_limits::max(); - for (const SDep &Succ : SUInCurrentIteration->Succs) { - if (!LoopEdges.isPostBoundaryNode(Succ.getSUnit())) - continue; - - EarliestCycle = std::min(EarliestCycle, int(Succ.getSUnit()->getHeight())); - } - - return EarliestCycle; + return LoopEdges.getPreHeight(SUInCurrentIteration); } /// Apply a set of heuristics to a new candidate for PostRA scheduling. @@ -1737,6 +1723,14 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA, DAG.ExitSU.setInstr(Region.getExitInstr()); DAG.makeMaps(); DAG.buildEdges(Context->AA); + // Apply all post-RA mutations in the correct order. buildGraph owns the + // complete graph construction pipeline (raw edges + mutations); the + // postProcessDAG() call in ScheduleDAGMI::schedule() is a no-op because + // the registered Mutations list is intentionally empty (getPostRAMutations + // returns empty, and createPostMachineScheduler no longer registers them). + const Triple &TT = DAG.MF.getTarget().getTargetTriple(); + for (auto &M : AIEBaseSubtarget::getPostRAMutationsImpl(TT, Context->AA)) + M->apply(&DAG); static_cast(DAG).recordDbgInstrs(Region); } diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp index a8859864a1bd..1ca79c94e817 100644 --- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp +++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp @@ -74,144 +74,89 @@ bool MaxLatencyFinder::isBottomRegion(MachineInstr *ExitMI) { return std::next(It) == CurBB->end(); } -/// Check whether SrcOp and DstOp might refer to the same value -static bool overlap(const MachineOperand &SrcOp, const MachineOperand &DstOp, - const TargetRegisterInfo *TRI) { - Register SrcReg = SrcOp.getReg(); - Register DstReg = DstOp.getReg(); +MaxLatencyFinder::MaxLatencyFinder(const MachineSchedContext &C, + const AIEPostRASchedStrategy *Scheduler, + MachineBasicBlock *CurBB) + : Scheduler(Scheduler), TII(static_cast( + 
C.MF->getSubtarget().getInstrInfo())), + Itineraries(C.MF->getSubtarget().getInstrItineraryData()), + TRI(C.MF->getSubtarget().getRegisterInfo()), CurBB(CurBB), + InterBlock(true) {} - // Use TRI's regsOverlap which handles both physical and virtual registers, - // including subregisters and lane masks - return TRI->regsOverlap(SrcReg, DstReg); -} - -/// Check whether Dst depends on Src -static bool depends(const MachineInstr &Src, const MachineInstr &Dst, - const TargetRegisterInfo *TRI, AAResults *AA, - bool SafeToIgnoreMemDeps) { - - const AIEBaseInstrInfo *const TII = static_cast( - Src.getMF()->getSubtarget().getInstrInfo()); - // Detect dependency between lock and ld/st intructions. - if ((TII->isLock(Src.getOpcode()) && (Dst.mayLoadOrStore())) || - (TII->isLock(Dst.getOpcode()) && (Src.mayLoadOrStore()))) { - return true; - } - - // We detect any common register input/output between Dst and Src - for (auto &SrcOp : Src.operands()) { - if (!SrcOp.isReg()) { - continue; - } - for (auto &DstOp : Dst.operands()) { - if (!DstOp.isReg()) { - continue; - } - // Exclude the RAR case - if (SrcOp.isUse() && DstOp.isUse()) { - continue; - } - if (overlap(SrcOp, DstOp, TRI)) { - return true; - } - } +// This is called from different contexts, so we need some case analysis +// If we have a basic block, we are in a regular MachineScheduler invocation, +// and we will be able to retrieve its strategy, +// Otherwise we are an abstract region; Scheduler will be nullptr, which +// will not be dereferenced. +MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG) + : Scheduler(DAG->getBB() + ? 
static_cast(DAG)->getSchedImpl() + : nullptr), + TII(static_cast(DAG->TII)), + Itineraries(DAG->getSchedModel()->getInstrItineraries()), + TRI(DAG->MF.getSubtarget().getRegisterInfo()), CurBB(DAG->getBB()), + InterBlock(InterBlockLatency && CurBB && + isBottomRegion(DAG->ExitSU.getInstr()) && + Scheduler->successorsAreScheduled(CurBB)) { + if (CurBB && Scheduler) { + const Region &CurRegion = + Scheduler->getInterBlock().getBlockState(CurBB).getCurrentRegion(); + buildInterBlockEdges(CurRegion); } +} - // Use alias analysis if available. - // The memory latency is accounted for by maxLatency() and any - // possible dependence will be corrected for by its scheduled cycle. - // (RAW || WAW) || - // (WAR) - if ((Src.mayStore() && (Dst.mayLoad() || Dst.mayStore())) || - (Src.mayLoad() && Dst.mayStore())) { - - // For non-part-word memory instructions, use alias analysis (if available) - // to determine if Src and Dst may alias. Part-word instructions are always - // treated conservatively due to their read-modify-write behavior. - auto IsPartWordStore = [&TII](const MachineInstr &MaybePartStore) { - return MaybePartStore.mayStore() && - TII->isPartWordMemoryInst(MaybePartStore); - }; +void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) { + const MachineSchedContext &C = *Scheduler->getInterBlock().getContext(); + const InterBlockScheduling &IB = Scheduler->getInterBlock(); - if (!IsPartWordStore(Src)) { + HasUnknownSuccessors = CurBB->succ_empty(); - // If it's safe to ignore memory dependencies, skip memory checks. - if (SafeToIgnoreMemDeps) - return false; + // When the outer loop pipeliner has annotated the epilogue to indicate that + // epilogue stores will not alias with loads from the peeled iteration, we + // suppress cross-boundary memory edges in the inter-block DDG accordingly. 
+ const bool SafeToIgnoreMemDeps = + IB.getBlockState(CurBB).isSafeToIgnoreMemDeps(); - if (AA) - return Src.mayAlias(AA, Dst, true); + for (MachineBasicBlock *SuccBB : CurBB->successors()) { + InterBlockEdges &SE = *PerSuccEdges.emplace_back( + std::make_unique(C, SafeToIgnoreMemDeps)); + + // Pre-boundary: free instructions of the current region. + for (MachineInstr *MI : CurRegion.getFreeInstructions()) + SE.addNode(MI); + + SE.markBoundary(); + + // Post-boundary: always use getFreeInstructions() as the single source of + // node identity. Empty regions signify empty basic blocks; in that case no + // post-boundary nodes are added. + const BlockState &SBS = IB.getBlockState(SuccBB); + if (!SBS.getRegions().empty()) { + for (MachineInstr *MI : SBS.getTop().getFreeInstructions()) + SE.addNode(MI); } - // Conservative: assume dependency for part-word instructions or when AA - // is unavailable - return true; - } + SE.buildEdges(); - return false; -} - -InstrAndCycle findEarliestRef(const MachineInstr &SrcMI, - ArrayRef Bundles, int Prune, - AAResults *AA, bool SafeToIgnoreMemDeps) { - const TargetRegisterInfo *TRI = - SrcMI.getMF()->getSubtarget().getRegisterInfo(); - int Cycle = 0; - for (const auto &Bundle : Bundles) { - if (Cycle >= Prune) { - LLVM_DEBUG(dbgs() << " prune at " << Cycle << "\n"); - return {/*MI=*/nullptr, Cycle}; + // After the graph is built, record the scheduled cycle depth for each + // post-boundary instruction and the total length of the successor block's + // top region. + // Instructions absent from Depths return depth 0 from getDepth(), which + // is the conservative value (no latency reduction) for unscheduled nodes. 
+ if (!SBS.isScheduled() || SBS.getRegions().empty()) { + // Only this successor lacks recorded depths; the remaining successors + // still need their own InterBlockEdges, so do not leave the loop. + continue; } - for (MachineInstr *DstMI : Bundle.getInstrs()) { - LLVM_DEBUG(dbgs() << " " << *DstMI); - if (depends(SrcMI, *DstMI, TRI, AA, SafeToIgnoreMemDeps)) { - LLVM_DEBUG(dbgs() << " depends in cycle=" << Cycle << "\n"); - return {DstMI, Cycle}; + int Cycle = 0; + for (const MachineBundle &Bundle : SBS.getTop().Bundles) { + for (MachineInstr *MI : Bundle.getInstrs()) { + SE.recordPostDepth(MI, Cycle); } + ++Cycle; } - Cycle++; + SE.setPostRegionLength(Cycle); } - return {/*MI=*/nullptr, Cycle}; } -MaxLatencyFinder::MaxLatencyFinder( - const AIEPostRASchedStrategy *const Scheduler, - const AIEBaseInstrInfo *const TII, - const InstrItineraryData *const Itineraries, - const MCRegisterInfo *const TRI, MachineBasicBlock *const CurBB, - AAResults *AA) - : Scheduler(Scheduler), TII(TII), Itineraries(Itineraries), TRI(TRI), - CurBB(CurBB), InterBlock(true), AA(AA), SafeToIgnoreMemDeps(false) {} - -// This is called from different contexts, so we need some case analysis -// If we have a basic block, we are in a regular MachineScheduler invocation, -// and we will be able to retrieve its strategy, -// Otherwise we are an abstract region; Scheduler will be nullptr, which -// will not be derefenced. -MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA) - : Scheduler(DAG->getBB() - ? static_cast(DAG)->getSchedImpl() - : nullptr), - TII(static_cast(DAG->TII)), - Itineraries(DAG->getSchedModel()->getInstrItineraries()), - TRI(DAG->MF.getSubtarget().getRegisterInfo()), CurBB(DAG->getBB()), - InterBlock(InterBlockLatency && CurBB && - isBottomRegion(DAG->ExitSU.getInstr()) && - Scheduler->successorsAreScheduled(CurBB)), - AA(AA), - // This is a current assumption needed to achieve a proper compact - // schedule. - // A loop is considered a candidate for outer loop pipelining if there are - // no memory-carried dependencies. 
The outer loop pipeliner attaches - // related metadata to the loop/epilogue, which we capture here. This - // metadata indicates that epilogue stores will not alias with loads from - // the peeled iteration. We will further analyze why AA is too - // conservative in some cases and remove this assumption when possible. - SafeToIgnoreMemDeps(Scheduler && CurBB && - Scheduler->getInterBlock() - .getBlockState(CurBB) - .isSafeToIgnoreMemDeps()) {} - unsigned MaxLatencyFinder::operator()(MachineInstr &MI) { LLVM_DEBUG(dbgs() << MI << "\n"); // If we don't use interblock information, include the 'StageLatency' @@ -239,29 +184,55 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) { } return Latency; } - LLVM_DEBUG(dbgs() << "Earliest for: " << MI); - // Track the earliest use in any successor block, given the cycles in - // which these uses are scheduled - int Earliest = Latency; - for (MachineBasicBlock *SuccBB : CurBB->successors()) { - auto &SBS = IB.getBlockState(SuccBB); - assert(SBS.isScheduled()); - if (SBS.getRegions().empty()) { - // Blocks can be empty. getTop() will fail, and Earliest=0 is - // a conservative value - Earliest = 0; + + int EffectiveLatency = HasUnknownSuccessors ? Latency : 0; + LLVM_DEBUG(dbgs() << " EffectiveLatency=" << EffectiveLatency + << (HasUnknownSuccessors ? 
" (HasUnknownSuccessors)" + : " (known successors)") + << "\n"); + for (auto &SEPtr : PerSuccEdges) { + InterBlockEdges &SE = *SEPtr; + const SUnit *Pred = SE.getPreBoundaryNode(&MI); + if (!Pred) { + LLVM_DEBUG( + dbgs() << " No pre-boundary node for this successor, skip\n"); continue; } - const std::vector &TopBundles = SBS.getTop().Bundles; - Earliest = - findEarliestRef(MI, TopBundles, Earliest, AA, SafeToIgnoreMemDeps) - .Cycle; + LLVM_DEBUG(dbgs() << " Pre-boundary SU#" << Pred->NodeNum << " has " + << Pred->Succs.size() << " successor edge(s)\n"); + + for (const SDep &Dep : Pred->Succs) { + SUnit *Succ = Dep.getSUnit(); + if (!SE.isPostBoundaryNode(Succ)) { + LLVM_DEBUG(dbgs() << " SU#" << Succ->NodeNum + << " is not a post-boundary node, skip\n"); + continue; + } + + // For ExitSU the depth is the full length of the successor block's + // top region (all its cycles have elapsed before reaching ExitSU). + // For a regular instruction node the depth is its scheduled cycle + // within the block. + const int Depth = Succ->isBoundaryNode() ? SE.getPostRegionLength() + : SE.getPostDepthOr(Succ, 0); + const int EdgeLat = Dep.getSignedLatency(); + const int Remaining = EdgeLat - Depth; + LLVM_DEBUG( + dbgs() << " " << (Succ->isBoundaryNode() ? "ExitSU" : "SU#") + << (Succ->isBoundaryNode() ? "" + : std::to_string(Succ->NodeNum)) + << ": latency=" << EdgeLat << ", depth=" << Depth + << ", remaining=" << Remaining + << ", updating EffectiveLatency " << EffectiveLatency << " -> " + << std::max(EffectiveLatency, Remaining) << "\n"); + EffectiveLatency = std::max(EffectiveLatency, Remaining); + } } + // Cap at the raw maxLatency of the source instruction. 
+ EffectiveLatency = std::min(EffectiveLatency, Latency); + LLVM_DEBUG(dbgs() << " EffectiveLatency=" << EffectiveLatency << "\n"); - LLVM_DEBUG(dbgs() << " Earliest=" << Earliest << "\n"); - Latency = std::max(Latency - Earliest, 1); - LLVM_DEBUG(dbgs() << "EffectiveLatency=" << Latency << "\n"); - return Latency; + return static_cast(EffectiveLatency); } } // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h index e5e17d40452b..7705cf2ac4c5 100644 --- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h +++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h @@ -16,8 +16,11 @@ #define LLVM_LIB_TARGET_AIE_MAXLATENCYFINDER_H #include "AIEBaseSubtarget.h" +#include "AIEDataDependenceHelper.h" #include "AIEMachineScheduler.h" #include "llvm/CodeGen/MachineInstr.h" +#include +#include using namespace llvm; @@ -27,20 +30,6 @@ namespace llvm::AIE { int maxLatency(const MachineInstr *MI, const AIEBaseInstrInfo &InstrInfo, const InstrItineraryData &Itineraries, bool IncludeStages); -struct InstrAndCycle { - MachineInstr *MI = nullptr; - int Cycle; -}; - -/// Find the first dependence on SrcMI in Bundles[0,Prune) -/// \returns the Cycle in which the dependence happens or a conservative lower -/// bound and the instruction responsible for the dependency if it is -/// found. -InstrAndCycle findEarliestRef(const MachineInstr &SrcMI, - ArrayRef Bundles, int Prune, - AAResults *AA = nullptr, - bool SafeToIgnoreMemDeps = false); - class MaxLatencyFinder { const AIEPostRASchedStrategy *const Scheduler; const AIEBaseInstrInfo *const TII; @@ -48,24 +37,34 @@ class MaxLatencyFinder { const MCRegisterInfo *const TRI; MachineBasicBlock *const CurBB; const bool InterBlock; - AAResults *AA; - bool SafeToIgnoreMemDeps; - // Check whether this region connects to the successor blocks - // + /// One entry per CFG successor of CurBB. 
InterBlockEdges is heap-allocated + /// via unique_ptr because it inherits from ScheduleDAGInstrs which is not + /// safely moveable. + std::vector> PerSuccEdges; + + /// True when CurBB has no CFG successors (e.g. a return block), requiring + /// the conservative raw latency as a floor. + bool HasUnknownSuccessors = false; + + // Check whether this region connects to the successor blocks. bool isBottomRegion(MachineInstr *ExitMI); + // Build one InterBlockEdges per CFG successor of CurBB and populate + // PerSuccEdges. + void buildInterBlockEdges(const Region &CurRegion); + public: // Constructors - MaxLatencyFinder(const AIEPostRASchedStrategy *const Scheduler, - const AIEBaseInstrInfo *const TII, - const InstrItineraryData *const Itineraries, - const MCRegisterInfo *const TRI, - MachineBasicBlock *const CurBB, AAResults *AA = nullptr); + // Derive TII, TRI, and Itineraries from the scheduling context, keeping + // only Scheduler and CurBB as explicit parameters. + MaxLatencyFinder(const MachineSchedContext &C, + const AIEPostRASchedStrategy *Scheduler, + MachineBasicBlock *CurBB); - MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA = nullptr); + MaxLatencyFinder(ScheduleDAGInstrs *DAG); - // Find the maximum latency of MI taking successors into account + // Find the maximum latency of MI taking successors into account. 
unsigned operator()(MachineInstr &MI); }; diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index 68ee86c98a44..713f0d7e3981 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -34,8 +34,8 @@ declare { ptr, i20, i20 } @llvm.aie2.add.3d(ptr, i20, i20, i20, i20, i20, i20, i define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm2_data, ptr noalias %ofm_data, ptr %.out, ptr %conv.i.i.i.out, ptr %idx.ext9.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %conv.i.i.i.i.i.out, ptr %.out6, ptr %conv.i.i.i46.out, ptr %xtraiter.out, ptr %in_ptr1.051.unr.ce.out, ptr %in_ptr2.0.in50.unr.ce.out, ptr %out_ptr.049.unr.ce.out, ptr %itr_left_cnt0.048.unr.ce.out, ptr %itr_left_cnt1.047.unr.ce.out) #3 { ; ASM-LABEL: add2d: ; ASM: // %bb.0: // %newFuncRoot -; ASM-NEXT: paddb [p0], #40; lda r2, [p0, #64]; nops ; nopxm ; nopv -; ASM-NEXT: lda m2, [p0], #-4; nopx +; ASM-NEXT: lda r2, [p0, #64]; paddb [p0], #40; nopxm +; ASM-NEXT: lda m2, [p0], #-4 ; ASM-NEXT: lda m5, [p0], #8 ; ASM-NEXT: lda m4, [p0], #8 ; ASM-NEXT: lda m3, [p0], #-24 @@ -60,14 +60,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: lda p0, [p7], #-4; st m1, [p0, #0]; add r7, r2, #-1; mov r6, #1 ; ASM-NEXT: lda p4, [p7], #-4; st m0, [p0, #0]; ne r6, r0, r6 ; ASM-NEXT: lda r13, [p7], #-4; st dj0, [p0, #0]; movx r0, #3 -; ASM-NEXT: st dj4, [p0, #0]; ltu r7, r7, r0 -; ASM-NEXT: st dn0, [p0, #0]; nez r1, r1 -; ASM-NEXT: lda r9, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2 -; ASM-NEXT: lda r10, [p7], #-4; st r1, [p6, #0] // Delay Slot 5 -; ASM-NEXT: lda r11, [p7], #-4; st r5, [p0, #0] // Delay Slot 4 -; ASM-NEXT: lda p7, [p7, #-4]; paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3 -; ASM-NEXT: lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13; padds [p1], m2 // Delay Slot 2 -; ASM-NEXT: 
mova r0, #0; paddb [p2], m3; st r8, [p0, #0] // Delay Slot 1 +; ASM-NEXT: lda r9, [p7], #-4; st dj4, [p0, #0]; ltu r7, r7, r0 +; ASM-NEXT: lda r10, [p7], #-4; st dn0, [p0, #0]; nez r1, r1 +; ASM-NEXT: lda r11, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2 +; ASM-NEXT: lda p7, [p7, #-4]; st r1, [p6, #0] // Delay Slot 5 +; ASM-NEXT: st r5, [p0, #0] // Delay Slot 4 +; ASM-NEXT: paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3 +; ASM-NEXT: lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13 // Delay Slot 2 +; ASM-NEXT: padda [p1], m2; paddb [p2], m3; movx r0, #0; st r8, [p0, #0] // Delay Slot 1 ; ASM-NEXT: // %bb.1: ; ASM-NEXT: j #.LBB0_5 ; ASM-NEXT: nop // Delay Slot 5 @@ -76,20 +76,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: nop // Delay Slot 2 ; ASM-NEXT: mova r1, #0 // Delay Slot 1 ; ASM-NEXT: .LBB0_2: // %entry.new -; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv +; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc0, #0 ; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc4, dc0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; nopx -; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4 -; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r3 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm3, s1, [p2], d0 -; ASM-NEXT: nop -; ASM-NEXT: movxm ls, #.LBB0_3 -; ASM-NEXT: mova r0, #-4; movxm le, #.L_LEnd0 -; ASM-NEXT: and r0, r2, r0 -; ASM-NEXT: mova r2, #-2; add r0, r0, #-4 -; ASM-NEXT: lshl r0, r0, r2; mov crSRSSign, r6 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; movx r0, #-4; mov crUPSSign, r4 +; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; movxm ls, #.LBB0_3 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; movxm le, #.L_LEnd0 +; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; and r0, r2, r0; mov s1, r3 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; add r0, 
r0, #-4; mov r2, #-2 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm3, s1, [p2], d0; lshl r0, r0, r2; mov crSRSSign, r6 ; ASM-NEXT: add r0, r0, #1; mov s0, r5 ; ASM-NEXT: nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-1; nopv ; ASM-NEXT: .LBB0_3: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index a70e2cd07a66..c0977cea0eeb 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -216,7 +216,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopx +; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopb ; nopx ; ZOL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 ; ZOL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m7, p5 ; ZOL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m7 @@ -229,17 +229,17 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb wh3, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m7; vldb wl7, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32]; vldb.3d wh7, [p0], d0 -; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m5; movxm ls, #.LBB0_2 -; ZOL-NEXT: vldb wl6, [p1], #32; movxm le, #.L_LEnd0 -; ZOL-NEXT: vlda wh6, [p1], #32; vldb wl5, [p0], m6; mov r1, p0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wh5, [p0], m6; and r0, r0, r9 -; ZOL-NEXT: vlda wl8, [p1], #32; vldb wl3, [p0], m6; add r0, r0, #33 -; ZOL-NEXT: vlda wh8, [p1], #32; vldb.3d wh3, [p0], d0; vshift.align x4, x4, s1, x3, r0 -; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2, #0]; vldb wl1, [p1], #32; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0 -; ZOL-NEXT: vldb wh1, [p1], #32; add r0, r1, #33; mov r1, p0 -; ZOL-NEXT: vldb wl10, [p1], #32; 
vshuffle x7, x4, x2, r2 -; ZOL-NEXT: vldb wh10, [p1], #32; vshuffle x9, x7, x0, r8 -; ZOL-NEXT: nopb ; nopa ; nops ; and r1, r1, r9; add.nc lc, r5, #-2; nopv +; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m5 +; ZOL-NEXT: vldb wl6, [p1], #32; movxm ls, #.LBB0_2 +; ZOL-NEXT: vldb wh6, [p1], #32; movxm le, #.L_LEnd0 +; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wl5, [p0], m6; and r0, r0, r9; mov r1, p0 +; ZOL-NEXT: vlda wl8, [p1], #32; vldb wh5, [p0], m6; add r0, r0, #33 +; ZOL-NEXT: vlda wh8, [p1], #32; vldb wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0 +; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2, #0]; vldb.3d wh3, [p0], d0; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0 +; ZOL-NEXT: vldb wl1, [p1], #32; add r0, r1, #33; mov r1, p0 +; ZOL-NEXT: vldb wh1, [p1], #32; vshuffle x7, x4, x2, r2 +; ZOL-NEXT: vldb wl10, [p1], #32; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; add.nc lc, r5, #-2; nopv ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll index 401aa4affd85..c4dae8bfdf2c 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll @@ -4,7 +4,7 @@ ; See https://llvm.org/LICENSE.txt for license information. ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; -; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +; (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates ; RUN: llc -O2 -mtriple=aie2 \ ; RUN: %s -o - | FileCheck %s @@ -65,6 +65,7 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: nopb ; nopa ; nops ; movxm r3, #16512; nopv ; CHECK-NEXT: nopa ; movxm r4, #-16256 ; CHECK-NEXT: movxm r5, #32767 +; CHECK-NEXT: movxm r6, #15616 ; CHECK-NEXT: movxm r0, #16256 ; CHECK-NEXT: movxm r1, #16384 ; CHECK-NEXT: lda r0, [p2, #0]; movxm r2, #16128 @@ -73,37 +74,36 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: vbcst.16 x2, r2 ; CHECK-NEXT: mova r1, #0; vconv.fp32.bf16 bmh0, wl2 ; CHECK-NEXT: vbcst.16 x2, r1 -; CHECK-NEXT: vldb wl3, [p0], #32; vmov wh0, wl2 -; CHECK-NEXT: mova r1, #-5; vmov wh3, wl2 +; CHECK-NEXT: vmov wh0, wl2 +; CHECK-NEXT: mova r1, #-5; vldb wl3, [p0], #32; vmov wh3, wl2 ; CHECK-NEXT: mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3 -; CHECK-NEXT: movxm r6, #15616; vmul.f bmh2, x0, x3, r1 -; CHECK-NEXT: movxm r7, #16000 +; CHECK-NEXT: movxm r7, #16000; vmul.f bmh2, x0, x3, r1 ; CHECK-NEXT: vbcst.16 x1, r3 ; CHECK-NEXT: vbcst.16 x8, r4 -; CHECK-NEXT: vbcst.16 x10, r5; vmul.f bmh3, x0, x3, r1 +; CHECK-NEXT: vbcst.16 x10, r5 ; CHECK-NEXT: vbcst.16 x6, r6 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7 -; CHECK-NEXT: vmov wh6, wl2 +; CHECK-NEXT: vbcst.16 x4, r7; vmul.f bmh3, x0, x3, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vmov wh6, wl2 +; CHECK-NEXT: vmov wh4, wl2 ; CHECK-NEXT: vmin_ge.bf16 x3, r16, x3, x1 ; CHECK-NEXT: or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x8 -; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x10, x3 -; CHECK-NEXT: vmov wh7, wl2 +; CHECK-NEXT: vband x7, x10, x3 +; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vmov wh3, wl2 +; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2 ; CHECK-NEXT: vmin_ge.bf16 x5, r16, x5, x1 ; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x8 -; CHECK-NEXT: vband x7, x10, x5 -; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2; 
vmul.f bmh2, x6, x7, r1 -; CHECK-NEXT: vmov wh4, wl2 -; CHECK-NEXT: vmov wh3, wl2; vmul.f bmh4, x6, x7, r1 -; CHECK-NEXT: nop -; CHECK-NEXT: vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1 +; CHECK-NEXT: vband x7, x10, x5; vmul.f bmh2, x6, x7, r1 +; CHECK-NEXT: vmov wh7, wl2 +; CHECK-NEXT: vmac.f bmh3, bmh0, x3, x4, r1 +; CHECK-NEXT: vmul.f bmh4, x6, x7, r1 ; CHECK-NEXT: vmul.f bmh5, x0, x7, r1 -; CHECK-NEXT: vmac.f bmh6, bmh0, x5, x4, r1 +; CHECK-NEXT: vmov wh5, wl2 ; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; vmul.f bmh7, x0, x7, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4 +; CHECK-NEXT: vmac.f bmh6, bmh0, x5, x4, r1 ; CHECK-NEXT: vmsc.f bmh3, bmh3, x7, x3, r1 -; CHECK-NEXT: movxm ls, #.LBB0_1; vmsc.f bml4, bmh6, x3, x5, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; movxm ls, #.LBB0_1 ; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh5; movxm le, #.L_LEnd0 -; CHECK-NEXT: add.nc lc, r2, #-2 +; CHECK-NEXT: add.nc lc, r2, #-2; vmsc.f bml4, bmh6, x3, x5, r1 ; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh7; vmin_ge.bf16 x3, r16, x3, x1 ; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x8 ; CHECK-NEXT: mova r0, #28; vconv.bf16.fp32 wl7, bmh3; vmin_ge.bf16 x11, r16, x5, x1 diff --git a/llvm/test/CodeGen/AIE/aie2/extract.ll b/llvm/test/CodeGen/AIE/aie2/extract.ll index 28e9ce6a1524..220b8e83f119 100644 --- a/llvm/test/CodeGen/AIE/aie2/extract.ll +++ b/llvm/test/CodeGen/AIE/aie2/extract.ll @@ -99,9 +99,9 @@ define dso_local noundef <32 x i8> @_Z30test_extract_v64uint4_256_1024Dv128_DU8_ ; CHECK-NEXT: jz r0, #.LBB2_6 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: vlda wl4, [sp, #-160] // 32-byte Folded Reload Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: vlda wh5, [sp, #-64] // 32-byte Folded Reload Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: vlda wl4, [sp, #-160] // 32-byte Folded Reload Delay Slot 1 ; CHECK-NEXT: // %bb.3: // %if.else.i ; CHECK-NEXT: j #.LBB2_6 ; CHECK-NEXT: nop // Delay Slot 5 @@ -444,9 +444,9 @@ define dso_local noundef 
<4 x i64> @_Z20test_extract_v8acc32Dv32_u7__acc32i(<16 ; CHECK-NEXT: jz r0, #.LBB13_6 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: vlda amhh0, [sp, #-64] // 32-byte Folded Reload Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 1 ; CHECK-NEXT: // %bb.3: // %if.else.i ; CHECK-NEXT: j #.LBB13_6 ; CHECK-NEXT: nop // Delay Slot 5 @@ -664,9 +664,9 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv16_u7__acc64i(<16 ; CHECK-NEXT: jz r0, #.LBB20_6 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: vlda amhh0, [sp, #-64] // 32-byte Folded Reload Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 1 ; CHECK-NEXT: // %bb.3: // %if.else.i ; CHECK-NEXT: j #.LBB20_6 ; CHECK-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir index 2c48a9426854..19af57435e7d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc --mtriple=aie2 --run-pass=postmisched \ # RUN: %s -o - | FileCheck %s @@ -35,9 +35,6 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: $s1 = MOV_mv_scl killed $r2 - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def dead $srcarry, implicit-def $s0, implicit killed $r1, implicit killed $r4 { ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry ; CHECK-NEXT: $s0 = MOV_mv_scl killed $r4 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll index c2b8cf49d88d..b3384ae6f241 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll +++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll @@ -4,7 +4,7 @@ ; See https://llvm.org/LICENSE.txt for license information. ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; -; (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates +; (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates ; RUN: llc --mtriple=aie2 -O2 --aie-pipeliner-max-guards=2 -enable-aie-zol-without-minitercount=false %s -o - | FileCheck %s ; Similar to stage0.ll, but now with a do-while. 
Again we expect a three @@ -18,14 +18,13 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 noundef %n) { ; CHECK-LABEL: dot: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: nopa ; movxm m0, #2044 -; CHECK-NEXT: lda r3, [p1], m0; add r5, r1, #-1 -; CHECK-NEXT: lda r2, [p0], m0; jz r5, #.LBB0_5 -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; add r5, r1, #-1; nopm +; CHECK-NEXT: jz r5, #.LBB0_5 +; CHECK-NEXT: movxm m0, #2044 // Delay Slot 5 +; CHECK-NEXT: lda r2, [p0], m0 // Delay Slot 4 +; CHECK-NEXT: lda r3, [p1], m0 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: movx r0, #0 // Delay Slot 1 +; CHECK-NEXT: mova r0, #0 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %do.body ; CHECK-NEXT: lda r4, [p1], m0; add r5, r5, #-1 ; CHECK-NEXT: lda r1, [p0], m0; jz r5, #.LBB0_4 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll index 2eb14f4b2d98..fa1c257540d1 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll +++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll @@ -55,7 +55,7 @@ define dso_local i32 @dot(ptr addrspace(6) nocapture readonly %a, ptr addrspace( ; PRE-NEXT: - NS: '3' ; PRE-NEXT: - Loop: bb.2.for.body ; PRE-NEXT: - Prologue: bb.1.for.body.preheader -; PRE-NEXT: - PrologueBundles: '10' +; PRE-NEXT: - PrologueBundles: '7' ; PRE-NEXT: - Epilogue: bb.3 ; PRE-NEXT: - EpilogueBundles: '6' ; PRE-NEXT: ... 
diff --git a/llvm/test/CodeGen/AIE/aie2/set.ll b/llvm/test/CodeGen/AIE/aie2/set.ll index 66c81001a1ad..294939198b68 100644 --- a/llvm/test/CodeGen/AIE/aie2/set.ll +++ b/llvm/test/CodeGen/AIE/aie2/set.ll @@ -16,8 +16,8 @@ define dso_local noundef <64 x i8> @_Z29test_set_v128uint_set_512_256iDv32_DU8_( ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.else.i ; CHECK-NEXT: vmov wh0, wl0 ; CHECK-NEXT: .LBB0_2: // %_ZL13set_v128uint4iDv32_DU8_.exit @@ -199,8 +199,8 @@ define dso_local noundef <128 x i8> @_Z27test_set_v256uint4_1024_512iDv64_DU8_(i ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: vmov x4, x0 // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: vmov x4, x0 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.else.i ; CHECK-NEXT: vmov x5, x4 ; CHECK-NEXT: .LBB4_2: // %_ZL13set_v256uint4iDv64_DU8_.exit diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll index 0dd29ac72714..ccf6cb3a1140 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll @@ -27,9 +27,9 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali ; CHECK-NEXT: vlda.pop.512 x6, [p0, lf0, r24]; movs dc1, dj0; mov dn1, dn0 ; CHECK-NEXT: vldb.pop.512.2d x4, [p0, lf0, r24, d1] ; CHECK-NEXT: nop -; CHECK-NEXT: lda m0, [p2, #4]; vldb.fill.512 [p0, lf0, r24] +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: vlda.pop.512 x6, [p0, lf0, r24] -; CHECK-NEXT: vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1 +; CHECK-NEXT: lda m0, [p2, #4]; 
vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1 ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; add.nc lc, r0, #-3; nopv ; CHECK-NEXT: vlda.pop.512 x6, [p0, lf0, r24]; nopb ; nops ; nopx ; vconv.fp32.bf16 cml1, x6; nopv diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll index b72536458b9e..e52c94ef8500 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll @@ -17,54 +17,53 @@ define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 dereferenceable(64) %params) { ; CHECK-LABEL: gelu_fn: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; nopx +; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopx ; CHECK-NEXT: movxm r0, #16544 ; CHECK-NEXT: vbcst.16 x6, r0 ; CHECK-NEXT: lda r1, [p2, #0]; movxm r0, #17280 ; CHECK-NEXT: mova r0, #60; vbcst.16 x2, r0 ; CHECK-NEXT: vadd.f dm3, dm1, dm0, r0 -; CHECK-NEXT: vconv.fp32.bf16 cml0, x6 +; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.fp32.bf16 cml0, x6 +; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64 ; CHECK-NEXT: movxm r2, #15821 -; CHECK-NEXT: mova r2, #255; movx r4, #1; vbcst.16 x4, r2 +; CHECK-NEXT: mova r2, #255; movx r4, #1; vbcst.16 x4, r2; vadd.f dm3, dm2, dm0, r0 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x8, cml3; lshl r2, r1, r4; vbcst.16 x0, r2 -; CHECK-NEXT: mova r2, #828; mov m0, r2; vadd.f dm3, dm2, dm0, r0 -; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm2, x8, x2, r2 +; CHECK-NEXT: mova r2, #828; mov m0, r2 +; CHECK-NEXT: vmul.f dm2, x8, x2, r2 ; CHECK-NEXT: nop -; CHECK-NEXT: vadd.f dm3, dm1, dm0, r0 ; CHECK-NEXT: nop -; CHECK-NEXT: vadd.f dm3, dm2, dm0, r0 -; CHECK-NEXT: vconv.bf16.fp32 x10, cml3 +; CHECK-NEXT: vconv.bf16.fp32 x10, cml3; vadd.f dm3, dm1, 
dm0, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm1, x10, x2, r2 ; CHECK-NEXT: vconv.bf16.fp32 x8, cml2 -; CHECK-NEXT: vmul.f dm1, x10, x2, r2 -; CHECK-NEXT: vconv.bf16.fp32 x1, cml3 -; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vmul.f dm4, x8, x4, r2 -; CHECK-NEXT: vconv.bf16.fp32 x7, cml3; vmul.f dm2, x1, x2, r2 -; CHECK-NEXT: nop -; CHECK-NEXT: vmul.f dm3, x7, x2, r2 -; CHECK-NEXT: vconv.bf16.fp32 x10, cml1; vadd.f dm1, dm1, dm0, r0 -; CHECK-NEXT: nop -; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.bf16.fp32 x8, cml4; movx r3, #0; vmul.f dm4, x10, x4, r2 -; CHECK-NEXT: vconv.bf16.fp32 x5, cml2; mov s0, r3 -; CHECK-NEXT: vfloor.s32.bf16 x1, wl8, s0 -; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; vmul.f dm4, x5, x4, r2 -; CHECK-NEXT: vconv.bf16.fp32 x7, cml1; movxm ls, #.LBB0_1; vadd.f dm2, dm2, dm0, r0 -; CHECK-NEXT: mova r4, #-5; nopb ; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0; vmul.f dm3, x5, x4, r2 -; CHECK-NEXT: mova r1, #2; vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vmul.f dm4, x7, x2, r2 -; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vshuffle x1, x1, x3, r1 -; CHECK-NEXT: vfloor.s32.bf16 x9, wl10, s0; vmin_ge.16 x3, r16, x1, x0, vaddsign1 -; CHECK-NEXT: vfloor.s32.bf16 x3, wh10, s0; vbcst.16 x6, r3 -; CHECK-NEXT: vconv.bf16.fp32 x8, cml4; vmax_lt.16 x11, r16, x3, x6, vaddsign1 -; CHECK-NEXT: padda [p1], m0; nopb ; nops ; nopx ; add.nc lc, r4, #-7; nopv +; CHECK-NEXT: nop +; CHECK-NEXT: vmul.f dm4, x8, x4, r2 +; CHECK-NEXT: vconv.bf16.fp32 x1, cml3; vadd.f dm3, dm2, dm0, r0 +; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64 +; CHECK-NEXT: vconv.bf16.fp32 x10, cml1; vmul.f dm2, x1, x2, r2 +; CHECK-NEXT: nop +; CHECK-NEXT: vmul.f dm4, x10, x4, r2 +; CHECK-NEXT: mova r3, #0; vconv.bf16.fp32 x8, cml4; vadd.f dm1, dm1, dm0, r0 +; CHECK-NEXT: vconv.bf16.fp32 x7, cml3; mov s0, r3 +; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vfloor.s32.bf16 x1, wl8, s0 +; CHECK-NEXT: vconv.bf16.fp32 x5, cml2; vmul.f 
dm3, x7, x2, r2 +; CHECK-NEXT: vfloor.s32.bf16 x3, wh8, s0; movxm ls, #.LBB0_1 +; CHECK-NEXT: mova r4, #-5; nopb ; vconv.bf16.fp32 x10, cml4; movxm le, #.L_LEnd0; vmul.f dm4, x5, x4, r2 +; CHECK-NEXT: vconv.bf16.fp32 x7, cml1; lshl r4, r1, r4; vadd.f dm2, dm2, dm0, r0 +; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; movx r1, #2; vbcst.16 x6, r3 +; CHECK-NEXT: vfloor.s32.bf16 x9, wl10, s0; vshuffle x1, x1, x3, r1; vmul.f dm4, x7, x2, r2 +; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; vmin_ge.16 x3, r16, x1, x0, vaddsign1 +; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x3, wh10, s0; nopx ; add.nc lc, r4, #-7; nopv +; CHECK-NEXT: padda [p1], m0; nopb ; vconv.bf16.fp32 x8, cml4; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; vmul.f dm3, x5, x4, r2 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x10, cml2; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vadd.f dm2, dm4, dm0, r0 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x7, cml4; nopx ; vmov cml4, cml1; vmul.f dm4, x10, x2, r2 ; CHECK-NEXT: nopa ; nopb ; vst x11, [p1], #64; nopx ; vshuffle x1, x9, x3, r1; nopv -; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x3, wh8, s0; nopx ; vmin_ge.16 x5, r16, x1, x0, vaddsign1; nopv -; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x9, wl8, s0; nopx ; vmax_lt.16 x11, r16, x5, x6, vaddsign1; nopv +; CHECK-NEXT: vfloor.s32.bf16 x3, wh8, s0; vmin_ge.16 x5, r16, x1, x0, vaddsign1 +; CHECK-NEXT: vfloor.s32.bf16 x9, wl8, s0; vmax_lt.16 x11, r16, x5, x6, vaddsign1 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml3; nopxm ; vmul.f dm3, x7, x4, r2 ; CHECK-NEXT: // %bb.2: diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll index b7354dcb9369..6febf7745d81 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll +++ 
b/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll @@ -68,9 +68,9 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali ; ASM-NEXT: vlda.pop.512 x6, [p0, lf0, r24]; movs dc1, dj0; mov dn1, dn0 ; ASM-NEXT: vldb.pop.512.2d x4, [p0, lf0, r24, d1] ; ASM-NEXT: nop -; ASM-NEXT: lda m0, [p2, #4]; vldb.fill.512 [p0, lf0, r24] +; ASM-NEXT: vldb.fill.512 [p0, lf0, r24] ; ASM-NEXT: vlda.pop.512 x6, [p0, lf0, r24] -; ASM-NEXT: vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1 +; ASM-NEXT: lda m0, [p2, #4]; vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1 ; ASM-NEXT: movxm le, #.L_LEnd0 ; ASM-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; add.nc lc, r0, #-3; nopv ; ASM-NEXT: vlda.pop.512 x6, [p0, lf0, r24]; nopb ; nops ; nopx ; vconv.fp32.bf16 cml1, x6; nopv diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll index d890556cbc69..a6f59f17636e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll @@ -44,18 +44,15 @@ define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0 ; FINE-GRAINED-NEXT: .LBB0_1: // %for.body.i ; FINE-GRAINED-NEXT: // =>This Loop Header: Depth=1 ; FINE-GRAINED-NEXT: // Child Loop BB0_2 Depth 2 -; FINE-GRAINED-NEXT: nopa ; nopb ; nopx ; mov dn2, r3; movs dj2, p6 -; FINE-GRAINED-NEXT: movs dn6, r3; mov r17, dc6 -; FINE-GRAINED-NEXT: movs dj6, p6; mov m2, m4 -; FINE-GRAINED-NEXT: mova p1, #0; movs dc6, r4; mov r25, r18 -; FINE-GRAINED-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d2] -; FINE-GRAINED-NEXT: nop +; FINE-GRAINED-NEXT: nopa ; nopb ; movs dj2, p6; nopx ; mov dn2, r3; nopv +; FINE-GRAINED-NEXT: nopa ; movs dn6, r3; nopx ; mov r17, dc6 +; FINE-GRAINED-NEXT: movs dj6, p6; or r6, r5, r5; mov r5, dj4 +; FINE-GRAINED-NEXT: movs m2, m4; vmov lfl1, lfl0 ; FINE-GRAINED-NEXT: movs m1, m5; mov dn1, r3 -; 
FINE-GRAINED-NEXT: movs dc1, dc0; vmov lfl1, lfl0 -; FINE-GRAINED-NEXT: movs dj1, m5; vmov lfh1, lfh0 +; FINE-GRAINED-NEXT: movs dc1, dc0; mov dj1, m5 ; FINE-GRAINED-NEXT: mova p0, #0; movs dn5, r3; mov dj5, m5 -; FINE-GRAINED-NEXT: paddb.3d [p0], d1; or r6, r5, r5; mov r5, dj4 -; FINE-GRAINED-NEXT: mova p0, #0; mov r21, dc5 +; FINE-GRAINED-NEXT: mova p1, #0; paddb.3d [p0], d1; or r25, r18, r18; vmov lfh1, lfh0; movs dc6, r4 +; FINE-GRAINED-NEXT: mova p0, #0; vldb.pop.576.3d ex0, [p1, lf1, r25, d2]; mov r21, dc5 ; FINE-GRAINED-NEXT: .LBB0_2: // %for.body125.i ; FINE-GRAINED-NEXT: // Parent Loop BB0_1 Depth=1 ; FINE-GRAINED-NEXT: // => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll b/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll index a3ea23ad71f1..cb2c9076b76d 100644 --- a/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll +++ b/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll @@ -4,7 +4,7 @@ ; See https://llvm.org/LICENSE.txt for license information. ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; -; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +; (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
or its affiliates ; RUN: llc < %s -verify-machineinstrs -mtriple=aie2p | FileCheck %s %struct.v64bfp16ebs16 = type <{ <64 x i8>, <8 x i8> }> @@ -197,8 +197,8 @@ define dso_local noundef <32 x i8> @_Z20test_extract_v32int813v64bfp16ebs16i(%st ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.else.i.i ; CHECK-NEXT: vmov wl0, wh0 ; CHECK-NEXT: .LBB10_2: // %_ZL15extract_v32int813v64bfp16ebs16i.exit @@ -238,8 +238,8 @@ define dso_local noundef <32 x i8> @_Z20test_extract_v32int812v64bfp16ebs8i(%str ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.else.i.i ; CHECK-NEXT: vmov wl0, wh0 ; CHECK-NEXT: .LBB11_2: // %_ZL15extract_v32int812v64bfp16ebs8i.exit @@ -340,8 +340,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z11test_insert13v128bfp16ebs8i12v64bfp1 ; CHECK-NEXT: nopa ; jz r0, #.LBB15_2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: mov r4, el2 // Delay Slot 3 -; CHECK-NEXT: vmov x1, x2 // Delay Slot 2 +; CHECK-NEXT: vmov x1, x2 // Delay Slot 3 +; CHECK-NEXT: mov r4, el2 // Delay Slot 2 ; CHECK-NEXT: mov r5, eh2 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end.i ; CHECK-NEXT: j #.LBB15_3 @@ -622,8 +622,8 @@ define dso_local %struct.v128bfp16ebs16 @_Z11test_insert14v128bfp16ebs16i13v64bf ; CHECK-NEXT: nopa ; jz r0, #.LBB22_2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: mov r4, el2 // Delay Slot 3 -; CHECK-NEXT: vmov x1, x2 // Delay Slot 2 +; CHECK-NEXT: vmov x1, x2 // Delay Slot 3 +; CHECK-NEXT: mov r4, el2 // Delay Slot 2 ; 
CHECK-NEXT: mov r5, eh2 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end.i ; CHECK-NEXT: j #.LBB22_3 diff --git a/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll b/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll index 10b3d045ad97..c42209602468 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll +++ b/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll @@ -32,16 +32,16 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias % ; CHECK-NEXT: eq r18, r20, r16 ; CHECK-NEXT: paddxm [sp], #64; lshl r28, r24, r16 ; CHECK-NEXT: st p6, [sp, #-64]; ltu r27, r16, r6 // 4-byte Folded Spill -; CHECK-NEXT: mova dj1, #96; st r26, [p3, dj0]; or r18, r28, r18 -; CHECK-NEXT: st.s8 r18, [p3, dj1]; add r2, r18, r2 -; CHECK-NEXT: sel.nez r18, r2, r0, r27 +; CHECK-NEXT: st p7, [sp, #-60]; or r18, r28, r18 // 4-byte Folded Spill +; CHECK-NEXT: mova dj1, #96; st r26, [p3, dj0]; add r2, r18, r2 +; CHECK-NEXT: st.s8 r18, [p3, dj1]; sel.nez r18, r2, r0, r27 ; CHECK-NEXT: ne r26, r2, r16 ; CHECK-NEXT: jnz r26, #.LBB0_2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: mova dj0, #76; ltu r28, r16, r18 // Delay Slot 3 -; CHECK-NEXT: st r28, [p3, dj0]; mov r7, r8 // Delay Slot 2 -; CHECK-NEXT: mova r2, #5; st p7, [sp, #-60]; or r17, r10, r10; mov r19, r12 // 4-byte Folded Spill Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: mova dj0, #76; ltu r28, r16, r18; mov r7, r8 // Delay Slot 2 +; CHECK-NEXT: mova r2, #5; st r28, [p3, dj0]; or r17, r10, r10; mov r19, r12 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: jnz r24, #.LBB0_3 ; CHECK-NEXT: nop // Delay Slot 5 @@ -73,11 +73,9 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias % ; CHECK-NEXT: or r26, r24, r18 ; CHECK-NEXT: .LBB0_3: // %if.end.i ; CHECK-NEXT: mova m0, #80; nopb ; nops ; nopx ; mov p2, p3; nopv -; CHECK-NEXT: padda [p2], m0 -; CHECK-NEXT: st r28, [p2], #24 -; CHECK-NEXT: st.s8 r26, [p2, #0]; ne r6, 
r20, r6 -; CHECK-NEXT: jnz r6, #.LBB0_5 -; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: padda [p2], m0; ne r6, r20, r6; nopm +; CHECK-NEXT: st r28, [p2], #24; jnz r6, #.LBB0_5 +; CHECK-NEXT: st.s8 r26, [p2, #0] // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 @@ -86,17 +84,15 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias % ; CHECK-NEXT: movxm r6, #16777215 ; CHECK-NEXT: mova dj0, #92; and r4, r4, r6 ; CHECK-NEXT: st r4, [p3, dj0] -; CHECK-NEXT: nop ; CHECK-NEXT: .LBB0_5: // %_Z24setup_conv2d_iter_paramsR13conv2d_params.exit -; CHECK-NEXT: mova dj0, #84; nopb ; nopxm -; CHECK-NEXT: lda r20, [p3, dj0]; extend.u8 r4, r26 -; CHECK-NEXT: mova dj0, #120; eq r6, r4, r22 -; CHECK-NEXT: lda r24, [p3, dj0]; jnz r6, #.LBB0_7 +; CHECK-NEXT: nopa ; nopb ; nops ; extend.u8 r4, r26; nopm ; nopv +; CHECK-NEXT: eq r6, r4, r22 +; CHECK-NEXT: jnz r6, #.LBB0_7 ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: mova dj0, #84 // Delay Slot 4 +; CHECK-NEXT: lda r20, [p3, dj0] // Delay Slot 3 +; CHECK-NEXT: mova dj0, #120 // Delay Slot 2 +; CHECK-NEXT: lda r24, [p3, dj0] // Delay Slot 1 ; CHECK-NEXT: // %bb.6: // %_Z24setup_conv2d_iter_paramsR13conv2d_params.exit ; CHECK-NEXT: ne r4, r4, r16 ; CHECK-NEXT: jnz r4, #.LBB0_11 @@ -142,22 +138,21 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias % ; CHECK-NEXT: .LBB0_8: // %for.body.i68 ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_9 Depth 2 -; CHECK-NEXT: nopa ; vldb x1, [p1, #64]; nopx ; mov r0, dc6; nops +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; mov r0, dc6; nopv ; CHECK-NEXT: vlda.ups.2x cml1, s0, upssign1, [p4, #0]; vldb.popx x4, [p0, lf0, r24]; lshl r0, r0, r2; mov dc4, dc3 ; CHECK-NEXT: vlda.pop.3d x6, [p0, lf0, r24, d0]; or r20, r0, 
r16; mov dj3, r0 ; CHECK-NEXT: vldb.128 wl2, [p2, dj3]; mov dj3, r20 ; CHECK-NEXT: vldb.128 wl8, [p2, dj3] ; CHECK-NEXT: vlda.ups.2x cml0, s0, upssign1, [p6, #0]; vldb x10, [p1, #0] -; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p4, #64] -; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p6, #64]; vldb.popx x10, [p0, lf0, r24] -; CHECK-NEXT: vldb.pop.3d x8, [p0, lf0, r24, d0] -; CHECK-NEXT: vldb.popx x10, [p0, lf0, r24]; mov p7, p1 -; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; vshuffle x2, x4, x6, r6; vmul dm2, x0, x2, r10 -; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; add.nc lc, r18, #-6; padds [p7], #128; vmul dm3, x0, x8, r10 -; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; movxm ls, #.LBB0_9; vaddmac dm1, dm1, dm2, x2, x10, r12 -; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x2, x1, r12 +; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p4, #64]; vldb.popx x10, [p0, lf0, r24] +; CHECK-NEXT: vlda x1, [p1, #64]; vldb.pop.3d x8, [p0, lf0, r24, d0] +; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p6, #64]; vldb.popx x10, [p0, lf0, r24]; mov p7, p1 +; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0] +; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; add.nc lc, r18, #-6; vshuffle x2, x4, x6, r6; vmul dm2, x0, x2, r10 +; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; movxm ls, #.LBB0_9; vmul dm3, x0, x8, r10 +; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x2, x10, r12 ; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopxm ; nopv -; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; nopv +; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; vaddmac dm0, dm0, dm3, x2, 
x1, r12 ; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopx ; vshuffle x2, x10, x8, r6; nopv ; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; nopv ; CHECK-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8 @@ -170,9 +165,9 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias % ; CHECK-NEXT: // %bb.10: // %for.cond.cleanup54.i89 ; CHECK-NEXT: // in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: vlda x4, [p7, #192]; paddb [p1], m3; padds [p7], #128; add r4, r4, #-1; nopm ; vmac dm0, dm0, x2, x4, r8 -; CHECK-NEXT: vlda x6, [p7, #128]; paddb [p4], #128; padds [p6], #128; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8 -; CHECK-NEXT: vlda x4, [p7, #192]; paddb.3d [p0], d1; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8 -; CHECK-NEXT: nopa ; paddb.3d [p1], d2; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8 +; CHECK-NEXT: vlda x6, [p7, #128]; paddb.3d [p1], d2; padds [p4], #128; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8 +; CHECK-NEXT: vlda x4, [p7, #192]; paddb [p6], #128; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8 +; CHECK-NEXT: nopa ; paddb.3d [p0], d1; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8 ; CHECK-NEXT: vmac dm0, dm0, x2, x4, r8 ; CHECK-NEXT: vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8 ; CHECK-NEXT: vmac dm0, dm0, x2, x4, r8 diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll index 3ba9874aebe2..f4015e67392c 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll +++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll @@ -43,8 +43,8 @@ declare i1 @llvm.loop.decrement.i32(i32) #3 define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr 
%cond, ptr %cond.i, ptr %ifm, i32 %cond88.i, i20 %idx.ext.i.i, i20 %idx.ext.i330.i, i20 %idx.ext.i334.i, i32 %1, i20 %idx.ext.i338.i, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %8, i20 %9, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i32 %conv197.i, i32 %conv.i.i.i.i.i, i20 %idx.ext.i342.i, i20 %idx.ext.i344.i, i20 %17, i20 %18, i20 %19, i32 %or.i.i, i32 %cond15.i.i.i.i.i, i20 %20, i20 %21, i20 %22, i20 %23, i20 %24, i32 %or22.i.i.i.i.i) #4 personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: conv2d: ; CHECK: // %bb.0: // %newFuncRoot -; CHECK-NEXT: paddxm [sp], #64; nopb ; nops ; nopx ; mov m4, p4; nopv -; CHECK-NEXT: mova m0, #-68; st p6, [sp, #-64]; nopx // 4-byte Folded Spill +; CHECK-NEXT: paddxm [sp], #64; nopb ; nopx ; mov m4, p4 +; CHECK-NEXT: mova m0, #-68; st p6, [sp, #-64] // 4-byte Folded Spill ; CHECK-NEXT: vlda.ups.2x cml1, s0, upssign1, [p1], m4; movs m2, p5; mov p6, sp ; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p1], m2; paddb [p6], m0 ; CHECK-NEXT: lda m5, [p6], #-4 @@ -53,8 +53,9 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr ; CHECK-NEXT: lda r21, [p6], #-4; mov s0, r1 ; CHECK-NEXT: lda r29, [p6], #-4; movx crupsmode, #0 ; CHECK-NEXT: lda r25, [p6], #-4 -; CHECK-NEXT: lda dn0, [p6], #-4 -; CHECK-NEXT: lda r27, [p6], #-4 +; CHECK-NEXT: lda dn0, [p6], #-4; paddb [p1], m4 +; CHECK-NEXT: lda r27, [p6], #-4; paddb [p1], m5 +; CHECK-NEXT: vlda.ups.2x cml0, s0, upssign1, [p1, #0] ; CHECK-NEXT: lda m1, [p6], #-4 ; CHECK-NEXT: lda r31, [p6], #-4 ; CHECK-NEXT: lda r16, [p6], #-4 @@ -64,20 +65,16 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr ; CHECK-NEXT: lda r18, [p6], #-4 ; CHECK-NEXT: lda dn3, [p6], #-4 ; CHECK-NEXT: lda dn7, [p6], #-4 -; CHECK-NEXT: lda r20, [p6], #-4 -; CHECK-NEXT: lda m6, [p6], #-4 -; CHECK-NEXT: lda r1, [p6], #-4 -; CHECK-NEXT: lda r26, [p6], #-4 -; CHECK-NEXT: lda r22, [p6], #-4; mov dj4, #0 -; CHECK-NEXT: lda m2, [p6], #-4; 
mov s1, r3 -; CHECK-NEXT: lda dj2, [p6], #-4; or r28, r8, r8; mov dj3, #0 -; CHECK-NEXT: lda dj6, [p6], #-4; movs dc2, dj4; or r30, r5, r5; mov r5, dj4 -; CHECK-NEXT: lda dn2, [p6, #0]; movs dc6, dj4; or r8, r7, r7; mov r7, dj4 -; CHECK-NEXT: lda dn6, [p6, #-4]; movs dc0, dj4; mov dj1, r31 -; CHECK-NEXT: padda [p1], m4; movs dc1, dj4; mov dj5, r16 -; CHECK-NEXT: padda [p1], m5; movs dc5, dj4; mov dj7, r18 -; CHECK-NEXT: vlda.ups.2x cml0, s0, upssign1, [p1, #0]; movs dc3, dj4; mov r23, m5 -; CHECK-NEXT: padda [p1], m4; movs dc7, dj4; add r0, r0, #-1; mov p6, p0 +; CHECK-NEXT: lda r20, [p6], #-4; mov dj4, #0 +; CHECK-NEXT: lda m6, [p6], #-4; mov s1, r3 +; CHECK-NEXT: lda r1, [p6], #-4; mov dj3, #0 +; CHECK-NEXT: lda r26, [p6], #-4; movs dc2, dj4; or r28, r8, r8; mov dc6, dj4 +; CHECK-NEXT: lda r22, [p6], #-4; movs dc0, dj4; or r30, r5, r5; mov r5, dj4 +; CHECK-NEXT: lda m2, [p6], #-4; movs dc1, dj4; or r8, r7, r7; mov r7, dj4 +; CHECK-NEXT: lda dj2, [p6], #-4; movs dc5, dj4; mov r23, m5 +; CHECK-NEXT: lda dj6, [p6], #-4; movs dc3, dj4; mov dj1, r31 +; CHECK-NEXT: lda dn2, [p6, #0]; movs dc7, dj4; mov dj5, r16 +; CHECK-NEXT: lda dn6, [p6, #-4]; paddb [p1], m4; add r0, r0, #-1; mov p6, p0; movs dj7, r18 ; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p1, #0]; movs p0, p3; movx crsrsmode, #0; mov m4, r1 ; CHECK-NEXT: .LBB0_1: // %for.body.i ; CHECK-NEXT: // =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll index a4a5c9b46a78..fba50a7537bb 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll +++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll @@ -51,7 +51,7 @@ declare i1 @llvm.loop.decrement.i32(i32) #3 define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ptr %psum_1_tdm, ptr %ifm, ptr %add.ptr.i, <64 x i8> %1, i32 %conv10.i, i20 %2, i20 %3, i20 %4, i20 %5, 
i20 %6, i32 %7, i32 %or25.i.i.i, i32 %8, i20 %9, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i32 %conv91.i, i32 %20, i20 %idx.ext.i216.i, i20 %21, i20 %22, i20 %23, i32 %or22.i.i.i) #4 { ; CHECK-LABEL: conv2d: ; CHECK: // %bb.0: // %newFuncRoot -; CHECK-NEXT: paddxm [sp], #64 +; CHECK-NEXT: paddxm [sp], #64; nopb ; nopx ; CHECK-NEXT: st p6, [sp, #-64] // 4-byte Folded Spill ; CHECK-NEXT: mova m0, #-68; mov p6, sp ; CHECK-NEXT: padda [p6], m0 @@ -77,15 +77,14 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; CHECK-NEXT: lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4 ; CHECK-NEXT: movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7 ; CHECK-NEXT: vldb.pop.3d x1, [p1, lf1, r25, d0] -; CHECK-NEXT: nop -; CHECK-NEXT: vldb.128 wl2, [p5, #0]; or r22, r12, r12; mov r19, r8 -; CHECK-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r21, r10, r10; mov s0, r1 -; CHECK-NEXT: mova r16, #5; vldb x8, [p0, #0]; or r10, r3, r3; mov s1, r5 -; CHECK-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; add r0, r0, #-1; mov dc6, dc7; movs dc3, dc7 -; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc2, dc7; or r8, r7, r7; addm.nc r1, r0, #-1 -; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, #16]; movxm p4, #.LBB0_1; movs dc5, dc7 -; CHECK-NEXT: mova r18, #16; movs dc1, dc7; movx crupsmode, #0; vshuffle x10, x10, x1, r2 -; CHECK-NEXT: mova r12, #264; st p7, [sp, #-60]; movx crsrsmode, #0; mov m5, r17 // 4-byte Folded Spill +; CHECK-NEXT: or r22, r12, r12; mov r19, r8 +; CHECK-NEXT: mova r16, #5; or r21, r10, r10; mov s0, r1 +; CHECK-NEXT: mova r18, #16; vldb.128 wl4, [p5, #16]; or r10, r3, r3; mov s1, r5 +; CHECK-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc3, dc7; or r8, r7, r7; mov dc6, dc7 +; CHECK-NEXT: mova r12, #264; vldb.128 wl2, [p5, #0]; add r0, r0, #-1; mov dc5, dc7; movs dc2, dc7 +; CHECK-NEXT: vlda.ups.2x cml0, 
s0, upssign1, [p3], #64; vldb x8, [p0, #0]; movx crupsmode, #0; addm.nc r1, r0, #-1; movs dc1, dc7 +; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; st p7, [sp, #-60]; movxm p4, #.LBB0_1 // 4-byte Folded Spill +; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17 ; CHECK-NEXT: .LBB0_1: // %for.body.i ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_2 Depth 2 @@ -110,10 +109,10 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup54.i ; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: vlda x4, [p7, #192]; paddb.3d [p1], d1; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8 -; CHECK-NEXT: vlda x6, [p7, #128]; vldb.popx x10, [p1, lf1, r25]; movs dc4, dc7; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; CHECK-NEXT: vlda x4, [p7, #192]; paddb [p0], m4; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 -; CHECK-NEXT: padds.3d [p0], d2; vldb.pop.3d x1, [p1, lf1, r25, d0]; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; CHECK-NEXT: vlda x8, [p0, #0]; mov r0, dc6; vmac dm0, dm0, x2, x4, r8 +; CHECK-NEXT: vlda x6, [p7, #128]; paddb [p0], m4; movs dc4, dc7; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; CHECK-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 +; CHECK-NEXT: vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; CHECK-NEXT: vldb x8, [p0, #0]; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8 ; CHECK-NEXT: vldb x6, [p0, #64]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 ; CHECK-NEXT: or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8 ; CHECK-NEXT: movs dj7, r20; vldb.128 wl2, [p5, dj7]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, 
x6, r8 @@ -130,11 +129,11 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; CHECK-NEXT: vldb.popx x8, [p1, lf1, r25] ; CHECK-NEXT: vldb.pop.3d x6, [p1, lf1, r25, d0] ; CHECK-NEXT: vldb.popx x8, [p1, lf1, r25] -; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12 -; CHECK-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm3, x0, x4, r12 -; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vaddmac dm1, dm1, dm2, x10, x8, r10 -; CHECK-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x10, x6, r10 -; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; nopv +; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0] +; CHECK-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12 +; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12 +; CHECK-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10 +; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10 ; CHECK-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv ; CHECK-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv ; CHECK-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv @@ -168,7 +167,7 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; ; NO-PROLOGUE-SPLIT-LABEL: conv2d: ; NO-PROLOGUE-SPLIT: // %bb.0: // %newFuncRoot -; NO-PROLOGUE-SPLIT-NEXT: paddxm [sp], #64; nopb ; nopx +; NO-PROLOGUE-SPLIT-NEXT: paddxm [sp], #64; nopb ; 
nops ; nopxm ; nopv ; NO-PROLOGUE-SPLIT-NEXT: st p6, [sp, #-64] // 4-byte Folded Spill ; NO-PROLOGUE-SPLIT-NEXT: mova m0, #-68; mov p6, sp ; NO-PROLOGUE-SPLIT-NEXT: padda [p6], m0 @@ -192,22 +191,18 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; NO-PROLOGUE-SPLIT-NEXT: lda m3, [p6], #-4 ; NO-PROLOGUE-SPLIT-NEXT: lda dj3, [p6, #0]; movx r30, #63; mov dc7, #0 ; NO-PROLOGUE-SPLIT-NEXT: lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4 -; NO-PROLOGUE-SPLIT-NEXT: movs dc0, dc7; vldb.popx x4, [p1, lf1, r25]; mov dc4, dc7 -; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl2, [p5, #0]; vldb.pop.3d x6, [p1, lf1, r25, d0] -; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl8, [p5, #16] -; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2, #0] -; NO-PROLOGUE-SPLIT-NEXT: vldb x10, [p0, #0] -; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2, #64] -; NO-PROLOGUE-SPLIT-NEXT: nop -; NO-PROLOGUE-SPLIT-NEXT: mov s0, r1 -; NO-PROLOGUE-SPLIT-NEXT: mova r12, #264; or r21, r10, r10; mov r22, r12 -; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3, #0]; vldb x1, [p0, #64]; movx crupsmode, #0; vshuffle x2, x4, x6, r2; vmul dm2, x0, x2, r12 -; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3, #64]; or r10, r3, r3; mov r3, p5 -; NO-PROLOGUE-SPLIT-NEXT: st p7, [sp, #-60]; mov s1, r5; vaddmac dm1, dm1, dm2, x2, x10, r10 // 4-byte Folded Spill -; NO-PROLOGUE-SPLIT-NEXT: movs p5, p2; mov p7, p0; vmul dm3, x0, x8, r12 -; NO-PROLOGUE-SPLIT-NEXT: movs dc3, dc7; add r0, r0, #-1; mov dc6, dc7 -; NO-PROLOGUE-SPLIT-NEXT: movs dc2, dc7; or r19, r8, r8; addm.nc r1, r0, #-1 -; NO-PROLOGUE-SPLIT-NEXT: mova r16, #5; movs dc5, dc7; or r8, r7, r7; mov dc1, dc7 +; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl2, [p5, #0]; vldb.popx x4, [p1, lf1, r25]; movs dc0, dc7; mov dc4, dc7 +; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl8, [p5, #16]; vldb.pop.3d x6, [p1, lf1, r25, d0] +; NO-PROLOGUE-SPLIT-NEXT: vlda x1, [p0, #64] +; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml1, s0, 
upssign1, [p2, #0]; mov s0, r1 +; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3, #0]; vldb x10, [p0, #0]; mov r21, r10 +; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2, #64]; or r10, r3, r3; mov r3, p5 +; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3, #64]; mov p5, p2 +; NO-PROLOGUE-SPLIT-NEXT: st p7, [sp, #-60]; add r0, r0, #-1; mov s1, r5 // 4-byte Folded Spill +; NO-PROLOGUE-SPLIT-NEXT: mova r12, #264; movs p7, p0; or r22, r12, r12; mov dc3, dc7 +; NO-PROLOGUE-SPLIT-NEXT: movs dc6, dc7; movx crupsmode, #0; vshuffle x2, x4, x6, r2; vmul dm2, x0, x2, r12 +; NO-PROLOGUE-SPLIT-NEXT: movs dc2, dc7; or r19, r8, r8; addm.nc r1, r0, #-1; vmul dm3, x0, x8, r12 +; NO-PROLOGUE-SPLIT-NEXT: mova r16, #5; nopb ; movs dc5, dc7; or r8, r7, r7; mov dc1, dc7; vaddmac dm1, dm1, dm2, x2, x10, r10 ; NO-PROLOGUE-SPLIT-NEXT: mova r18, #16; nopb ; movs p4, p3; movx crsrsmode, #0; mov m5, r17; vaddmac dm0, dm0, dm3, x2, x1, r10 ; NO-PROLOGUE-SPLIT-NEXT: .LBB0_1: // %for.body.i ; NO-PROLOGUE-SPLIT-NEXT: // =>This Loop Header: Depth=1 @@ -235,24 +230,20 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; NO-PROLOGUE-SPLIT-NEXT: vlda x4, [p0, #192]; paddb [p7], m4; padds [p0], #128; nopx ; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda x6, [p0, #128]; paddb.3d [p7], d2; padds.3d [p1], d1; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda x4, [p0, #192]; nopb ; padds [p0], #128; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8 -; NO-PROLOGUE-SPLIT-NEXT: nopa ; vldb x10, [p7, #0]; movs p0, r3; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; NO-PROLOGUE-SPLIT-NEXT: vlda x1, [p7, #64]; vldb.popx x4, [p1, lf1, r25]; nops ; or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8 +; NO-PROLOGUE-SPLIT-NEXT: movs p0, r3; vldb x10, [p7, #0]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; NO-PROLOGUE-SPLIT-NEXT: vlda 
x1, [p7, #64]; vldb.popx x4, [p1, lf1, r25]; or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl2, [p0, dj7]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movs dj7, r20; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl8, [p0, dj7]; nopb ; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 +; NO-PROLOGUE-SPLIT-NEXT: vlda.128 wl8, [p0, dj7]; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2, #128]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2, #192]; nopb ; movs p2, p5; movxm p0, #.LBB0_1; vmac dm0, dm0, x2, x4, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3, #128]; vmac dm1, dm1, x2, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3, #192]; mov p3, p4; vmac dm0, dm0, x2, x4, r8 -; NO-PROLOGUE-SPLIT-NEXT: nop -; NO-PROLOGUE-SPLIT-NEXT: vmul dm2, x0, x2, r12 -; NO-PROLOGUE-SPLIT-NEXT: vshuffle x2, x4, x6, r2; vmul dm3, x0, x8, r12 -; NO-PROLOGUE-SPLIT-NEXT: nop -; NO-PROLOGUE-SPLIT-NEXT: vst.srs.4x dm1, s1, srssign0, [p6], m5; jnzd r1, r1, p0; vaddmac dm1, dm1, dm2, x2, x10, r10 -; NO-PROLOGUE-SPLIT-NEXT: vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vaddmac dm0, dm0, dm3, x2, x1, r10 // Delay Slot 5 -; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 4 +; NO-PROLOGUE-SPLIT-NEXT: jnzd r1, r1, p0 +; NO-PROLOGUE-SPLIT-NEXT: vmul dm2, x0, x2, r12 // Delay Slot 5 +; NO-PROLOGUE-SPLIT-NEXT: vshuffle x2, x4, x6, r2; vmul dm3, x0, x8, r12 // Delay Slot 4 ; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 3 -; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 2 -; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 1 +; NO-PROLOGUE-SPLIT-NEXT: vst.srs.4x dm1, s1, srssign0, [p6], m5; vaddmac dm1, dm1, dm2, x2, x10, r10 // Delay Slot 2 +; NO-PROLOGUE-SPLIT-NEXT: vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vaddmac dm0, dm0, dm3, x2, x1, 
r10 // Delay Slot 1 ; NO-PROLOGUE-SPLIT-NEXT: // %bb.4: // %cooldown.entry ; NO-PROLOGUE-SPLIT-NEXT: vldb.popx x8, [p1, lf1, r25] ; NO-PROLOGUE-SPLIT-NEXT: vldb.pop.3d x6, [p1, lf1, r25, d0] @@ -295,7 +286,7 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; ; NO-JNZD-LABEL: conv2d: ; NO-JNZD: // %bb.0: // %newFuncRoot -; NO-JNZD-NEXT: paddxm [sp], #64; nopb ; nopxm ; nops +; NO-JNZD-NEXT: paddxm [sp], #64; nopb ; nops ; nopxm ; nopv ; NO-JNZD-NEXT: st p6, [sp, #-64] // 4-byte Folded Spill ; NO-JNZD-NEXT: mova m0, #-68; mov p6, sp ; NO-JNZD-NEXT: padda [p6], m0 @@ -322,14 +313,13 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; NO-JNZD-NEXT: movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7 ; NO-JNZD-NEXT: vldb.pop.3d x1, [p1, lf1, r25, d0] ; NO-JNZD-NEXT: nop -; NO-JNZD-NEXT: vldb.128 wl2, [p5, #0] -; NO-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; movx r18, #5; mov r23, r12 -; NO-JNZD-NEXT: mova r20, #1; vldb x8, [p0, #0]; or r19, r8, r8; mov s0, r1 -; NO-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; or r21, r10, r10; mov s1, r5 -; NO-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; or r10, r3, r3; mov dc3, dc7 -; NO-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, #16]; or r8, r7, r7; mov dc2, dc7; movs dc6, dc7 -; NO-JNZD-NEXT: mova r22, #16; movs dc5, dc7; movx crupsmode, #0; vshuffle x10, x10, x1, r2 -; NO-JNZD-NEXT: mova r12, #264; movs dc1, dc7; movx crsrsmode, #0; mov m5, r17 +; NO-JNZD-NEXT: mova r20, #1; movx r18, #5; mov r23, r12 +; NO-JNZD-NEXT: mova r22, #16; vldb.128 wl4, [p5, #16]; or r19, r8, r8; mov s0, r1 +; NO-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r21, r10, r10; mov s1, r5 +; NO-JNZD-NEXT: mova r12, #264; vldb.128 wl2, [p5, #0]; or r10, r3, r3; mov dc3, dc7 +; NO-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; or r8, r7, r7; mov dc2, dc7; movs dc6, dc7 +; 
NO-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc5, dc7; movx crupsmode, #0; mov dc1, dc7 +; NO-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17 ; NO-JNZD-NEXT: .LBB0_1: // %for.body.i ; NO-JNZD-NEXT: // =>This Loop Header: Depth=1 ; NO-JNZD-NEXT: // Child Loop BB0_2 Depth 2 @@ -353,18 +343,18 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; NO-JNZD-NEXT: vlda x6, [p4, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 ; NO-JNZD-NEXT: // %bb.3: // %for.cond.cleanup54.i ; NO-JNZD-NEXT: // in Loop: Header=BB0_1 Depth=1 -; NO-JNZD-NEXT: vlda x4, [p4, #192]; paddb.3d [p1], d1; padds [p4], #128; add r0, r0, #-1; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8 -; NO-JNZD-NEXT: vlda x6, [p4, #128]; paddb [p0], m4; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; NO-JNZD-NEXT: vlda x4, [p4, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p4], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 -; NO-JNZD-NEXT: vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; NO-JNZD-NEXT: vldb x8, [p0, #0]; mov r16, dc6; vmac dm0, dm0, x2, x4, r8 -; NO-JNZD-NEXT: vldb x6, [p0, #64]; lshl r16, r16, r18; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; NO-JNZD-NEXT: or r24, r16, r22; mov dj7, r16; vmac dm0, dm0, x2, x4, r8 +; NO-JNZD-NEXT: vlda x4, [p4, #192]; paddb [p0], m4; padds [p4], #128; add r0, r0, #-1; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8 +; NO-JNZD-NEXT: vlda x6, [p4, #128]; paddb.3d [p1], d1; padds.3d [p0], d2; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; NO-JNZD-NEXT: vlda x4, [p4, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p4], #128; nopx ; mov r16, dc6; vmac dm0, dm0, x2, x4, r8 +; NO-JNZD-NEXT: vlda x8, [p0, #0]; vldb.pop.3d x1, [p1, lf1, r25, d0]; nops ; lshl r16, r16, r18; vshuffle x2, x10, 
x8, r2; vmac dm1, dm1, x2, x6, r8 +; NO-JNZD-NEXT: vlda x6, [p0, #64]; nopb ; or r24, r16, r22; mov dj7, r16; vmac dm0, dm0, x2, x4, r8 ; NO-JNZD-NEXT: movs dj7, r24; vldb.128 wl2, [p5, dj7]; eq r16, r0, r20; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 -; NO-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8 -; NO-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; vmac dm1, dm1, x2, x6, r8 -; NO-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vmac dm0, dm0, x2, x4, r8 -; NO-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vshuffle x10, x10, x1, r2 +; NO-JNZD-NEXT: vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8 +; NO-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; NO-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 +; NO-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vmac dm1, dm1, x2, x6, r8 +; NO-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vmac dm0, dm0, x2, x4, r8 +; NO-JNZD-NEXT: vshuffle x10, x10, x1, r2 ; NO-JNZD-NEXT: jz r16, #.LBB0_1 ; NO-JNZD-NEXT: nop // Delay Slot 5 ; NO-JNZD-NEXT: nop // Delay Slot 4 @@ -375,11 +365,11 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; NO-JNZD-NEXT: vldb.popx x8, [p1, lf1, r25] ; NO-JNZD-NEXT: vldb.pop.3d x6, [p1, lf1, r25, d0] ; NO-JNZD-NEXT: vldb.popx x8, [p1, lf1, r25] -; NO-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12 -; NO-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm3, x0, x4, r12 -; NO-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vaddmac dm1, dm1, dm2, x10, x8, r10 -; NO-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x10, x6, r10 -; NO-JNZD-NEXT: vlda x4, 
[p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; nopv +; NO-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0] +; NO-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12 +; NO-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12 +; NO-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10 +; NO-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10 ; NO-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv ; NO-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv ; NO-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv @@ -410,6 +400,121 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ; NO-JNZD-NEXT: nop // Delay Slot 3 ; NO-JNZD-NEXT: nop // Delay Slot 2 ; NO-JNZD-NEXT: nop // Delay Slot 1 +; USE-JNZD-LABEL: conv2d: +; USE-JNZD: // %bb.0: // %newFuncRoot +; USE-JNZD-NEXT: paddxm [sp], #64; nopb ; nopx +; USE-JNZD-NEXT: st p6, [sp, #-64] // 4-byte Folded Spill +; USE-JNZD-NEXT: mova m0, #-68; mov p6, sp +; USE-JNZD-NEXT: padda [p6], m0 +; USE-JNZD-NEXT: lda m0, [p6], #-4 +; USE-JNZD-NEXT: lda dn0, [p6], #-4 +; USE-JNZD-NEXT: lda dj0, [p6], #-4 +; USE-JNZD-NEXT: lda dn4, [p6], #-4 +; USE-JNZD-NEXT: lda dj4, [p6], #-4 +; USE-JNZD-NEXT: lda m4, [p6], #-4 +; USE-JNZD-NEXT: lda m1, [p6], #-4 +; USE-JNZD-NEXT: lda dj1, [p6], #-4 +; USE-JNZD-NEXT: lda dj5, [p6], #-4 +; USE-JNZD-NEXT: lda dn1, [p6], #-4 +; USE-JNZD-NEXT: lda dn5, [p6], #-4 +; USE-JNZD-NEXT: lda m2, [p6], #-4 +; USE-JNZD-NEXT: lda dj2, [p6], #-4 +; USE-JNZD-NEXT: lda dj6, [p6], #-4 +; USE-JNZD-NEXT: lda dn2, [p6], #-4 +; USE-JNZD-NEXT: lda 
dn6, [p6], #-4 +; USE-JNZD-NEXT: lda r17, [p6], #-4 +; USE-JNZD-NEXT: lda m3, [p6], #-4 +; USE-JNZD-NEXT: lda dj3, [p6, #0]; movx r30, #63; mov dc7, #0 +; USE-JNZD-NEXT: lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4 +; USE-JNZD-NEXT: movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7 +; USE-JNZD-NEXT: vldb.pop.3d x1, [p1, lf1, r25, d0] +; USE-JNZD-NEXT: or r22, r12, r12; mov r19, r8 +; USE-JNZD-NEXT: mova r16, #5; or r21, r10, r10; mov s0, r1 +; USE-JNZD-NEXT: mova r18, #16; vldb.128 wl4, [p5, #16]; or r10, r3, r3; mov s1, r5 +; USE-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc3, dc7; or r8, r7, r7; mov dc6, dc7 +; USE-JNZD-NEXT: mova r12, #264; vldb.128 wl2, [p5, #0]; add r0, r0, #-1; mov dc5, dc7; movs dc2, dc7 +; USE-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; movx crupsmode, #0; addm.nc r1, r0, #-1; movs dc1, dc7 +; USE-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; st p7, [sp, #-60]; movxm p4, #.LBB0_1 // 4-byte Folded Spill +; USE-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17 +; USE-JNZD-NEXT: .LBB0_1: // %for.body.i +; USE-JNZD-NEXT: // =>This Loop Header: Depth=1 +; USE-JNZD-NEXT: // Child Loop BB0_2 Depth 2 +; USE-JNZD-NEXT: nopa ; vldb.popx x10, [p1, lf1, r25]; nops ; nopxm ; nopv +; USE-JNZD-NEXT: vldb.pop.3d x8, [p1, lf1, r25, d0] +; USE-JNZD-NEXT: vldb.popx x10, [p1, lf1, r25]; mov p7, p0 +; USE-JNZD-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12 +; USE-JNZD-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p7], #128; vmul dm3, x0, x4, r12 +; USE-JNZD-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; movxm ls, #.LBB0_2; vaddmac dm1, dm1, dm2, x10, x8, r10 +; USE-JNZD-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; movxm le, #.L_LEnd1; vaddmac dm0, dm0, dm3, x10, x6, r10 +; 
USE-JNZD-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopxm ; nopv +; USE-JNZD-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; nopv +; USE-JNZD-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; nopv +; USE-JNZD-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; nopv +; USE-JNZD-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; USE-JNZD-NEXT: .LBB0_2: // %for.body55.i +; USE-JNZD-NEXT: // Parent Loop BB0_1 Depth=1 +; USE-JNZD-NEXT: // => This Inner Loop Header: Depth=2 +; USE-JNZD-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: .L_LEnd1: +; USE-JNZD-NEXT: vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; USE-JNZD-NEXT: // %bb.3: // %for.cond.cleanup54.i +; USE-JNZD-NEXT: // in Loop: Header=BB0_1 Depth=1 +; USE-JNZD-NEXT: vlda x4, [p7, #192]; paddb.3d [p1], d1; padds [p7], #128; nopx ; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: vlda x6, [p7, #128]; paddb [p0], m4; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; USE-JNZD-NEXT: vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; USE-JNZD-NEXT: nopa ; vldb x8, [p0, #0]; nops ; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: vldb x6, [p0, #64]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8 +; USE-JNZD-NEXT: or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: movs dj7, r20; vldb.128 wl2, [p5, dj7]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, 
x2, x6, r8 +; USE-JNZD-NEXT: vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: vmac dm1, dm1, x2, x6, r8 +; USE-JNZD-NEXT: vmac dm0, dm0, x2, x4, r8 +; USE-JNZD-NEXT: jnzd r1, r1, p4 +; USE-JNZD-NEXT: nop // Delay Slot 5 +; USE-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64 // Delay Slot 4 +; USE-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64 // Delay Slot 3 +; USE-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p3], #64; vst.srs.4x dm1, s1, srssign0, [p6], m5 // Delay Slot 2 +; USE-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vshuffle x10, x10, x1, r2 // Delay Slot 1 +; USE-JNZD-NEXT: // %bb.4: // %cooldown.entry +; USE-JNZD-NEXT: vldb.popx x8, [p1, lf1, r25] +; USE-JNZD-NEXT: vldb.pop.3d x6, [p1, lf1, r25, d0] +; USE-JNZD-NEXT: vldb.popx x8, [p1, lf1, r25] +; USE-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0] +; USE-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12 +; USE-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12 +; USE-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10 +; USE-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10 +; USE-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv +; USE-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv +; USE-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv +; USE-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 +; USE-JNZD-NEXT: .LBB0_5: // %for.body55.i.cd +; USE-JNZD-NEXT: // =>This Inner 
Loop Header: Depth=1 +; USE-JNZD-NEXT: vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: .L_LEnd0: +; USE-JNZD-NEXT: vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 +; USE-JNZD-NEXT: // %bb.6: // %cooldown.exit +; USE-JNZD-NEXT: vlda x2, [p0, #192]; nopb ; padds [p0], #128; movx crsrsmode, #0; mov s0, r5; vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: vlda x4, [p0, #128]; nopb ; movs dj0, r17; or r12, r22, r22; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 +; USE-JNZD-NEXT: vlda x2, [p0, #192]; nopb ; padds [p0], #128; or r10, r21, r21; mov srssign0, r6; vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: lda p7, [sp, #-60]; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 // 4-byte Folded Reload +; USE-JNZD-NEXT: vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 +; USE-JNZD-NEXT: vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 +; USE-JNZD-NEXT: vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: vmac dm1, dm1, x0, x4, r8 +; USE-JNZD-NEXT: vmac dm0, dm0, x0, x2, r8 +; USE-JNZD-NEXT: nop +; USE-JNZD-NEXT: nop +; USE-JNZD-NEXT: lda p6, [sp, #-64] // 4-byte Folded Reload +; USE-JNZD-NEXT: ret lr +; USE-JNZD-NEXT: vst.srs.4x dm1, s0, srssign0, [p6, #0] // Delay Slot 5 +; USE-JNZD-NEXT: vst.srs.4x dm0, s0, srssign0, [p6, dj0] // Delay Slot 4 +; USE-JNZD-NEXT: nop // Delay Slot 3 +; USE-JNZD-NEXT: nop // Delay Slot 2 +; USE-JNZD-NEXT: paddxm [sp], #-64; movx srssign0, #0; mov r8, r19 // Delay Slot 1 newFuncRoot: br label %for.body.i diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll index 125947b70f3d..dfbc9e84aaec 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll +++ 
b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll @@ -45,7 +45,7 @@ declare i1 @llvm.loop.decrement.i32(i32) #2 define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, ptr %p_psum, ptr %p_c, ptr %p_bias, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %idx.ext.i, i20 %8, i20 %9, i20 %10, i20 %11, i20 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %or23.i.i.i.i, <64 x i8> %17, i32 %18, i32 %19, i32 %20, i32 %21, i32 %22, i32 %or22.i.i.i.i, i32 %conv166, i32 %conv.i.i.i.i, i20 %idx.ext.i478, i20 %23, i20 %24, i20 %25, i20 %26, i20 %27, i20 %28) #3 { ; CHECK-LABEL: gemm: ; CHECK: // %bb.0: // %newFuncRoot -; CHECK-NEXT: paddxm [sp], #64; nopx +; CHECK-NEXT: paddxm [sp], #64; nopb ; nopx ; CHECK-NEXT: st p6, [sp, #-64] // 4-byte Folded Spill ; CHECK-NEXT: mova m0, #-68; mov p6, sp ; CHECK-NEXT: padda [p6], m0 @@ -66,34 +66,33 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; CHECK-NEXT: lda r17, [p6], #-4 ; CHECK-NEXT: lda r19, [p6], #-4 ; CHECK-NEXT: lda m5, [p6], #-4 -; CHECK-NEXT: vlda.ups.2x cml3, s0, upssign1, [p2], #64 -; CHECK-NEXT: vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0] -; CHECK-NEXT: vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; add r0, r0, #-1; mov r21, r8 -; CHECK-NEXT: vlda.ups.2x cmh2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r22, #0; mov dc0, #0 -; CHECK-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r23, r10, r10; mov s0, r22 -; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc1, dc0; or r10, r5, r5; mov dc5, dc0 -; CHECK-NEXT: lda m2, [p6], #-4; vldb.3d x10, [p1], d1; movx crupsmode, #0; vbcst.32 x2, r22 -; CHECK-NEXT: lda dj2, [p6], #-4; movs dc4, dc0; movx r22, #15; addm.nc r5, r0, #-1 -; CHECK-NEXT: lda dn2, [p6], #-4; vldb x6, [p0], #64; movs m0, p5; vsel.32 x4, x2, x4, r22 -; CHECK-NEXT: lda m3, [p6], #-4; vldb.3d x8, [p0], d0; vsel.32 x2, x2, x6, r22 -; CHECK-NEXT: vlda.ups.2x cml0, s0, upssign1, 
[p2], #64; vshuffle x6, x8, x0, r1 +; CHECK-NEXT: vldb.128 wl4, [p4, #0] +; CHECK-NEXT: vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; or r21, r8, r8; mov dc2, #0 +; CHECK-NEXT: vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r22, #0; mov dc0, #0 +; CHECK-NEXT: vlda.ups.2x cml2, s0, upssign1, [p2], #64; add r0, r0, #-1; vbcst.32 x2, r22 +; CHECK-NEXT: vlda.ups.2x cmh2, s0, upssign1, [p2], #64; or r23, r10, r10; mov s0, r22 +; CHECK-NEXT: lda m2, [p6], #-4; movs dc1, dc0; or r10, r5, r5; mov dc5, dc0 +; CHECK-NEXT: lda dj2, [p6], #-4; vldb.3d x10, [p1], d1; movx r22, #15; addm.nc r5, r0, #-1 +; CHECK-NEXT: lda dn2, [p6], #-4; movx crupsmode, #0; vsel.32 x4, x2, x4, r22 +; CHECK-NEXT: lda m3, [p6], #-4; vsel.32 x2, x2, x6, r22 +; CHECK-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r1 ; CHECK-NEXT: lda dj3, [p6, #0]; vshuffle x1, x6, x0, r2 ; CHECK-NEXT: lda dn3, [p6, #-4]; movxm p6, #.LBB0_1 -; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p2], #64; vshuffle x8, x10, x0, r3 -; CHECK-NEXT: mova dc2, #0; or r24, r12, r12; mov s1, r17 -; CHECK-NEXT: mova r12, #776; movs dc3, dc2; movx crsrsmode, #0; vshuffle x10, x8, x0, r4 +; CHECK-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vldb x6, [p0], #64; or r24, r12, r12; mov s1, r17; movs dc4, dc0 +; CHECK-NEXT: vlda.ups.2x cml0, s0, upssign1, [p2], #64; movs m0, p5; movx r12, #776; vshuffle x8, x10, x0, r3 +; CHECK-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p2], #64; vldb.3d x8, [p0], d0; movx crsrsmode, #0; vshuffle x10, x8, x0, r4; movs dc3, dc2 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_2 Depth 2 -; CHECK-NEXT: nopa ; vldb x9, [p1], m4; movs m0, p5; nopxm ; vmul dm4, x0, x4, r12 +; CHECK-NEXT: nopa ; vldb x9, [p1], m4; movs m0, p5; nopxm ; nopv ; CHECK-NEXT: vlda.3d x7, [p1], d1; nopb ; nopx -; CHECK-NEXT: vldb x5, [p0], #64; vaddmac dm3, dm3, dm4, x6, x1, r10 -; CHECK-NEXT: vlda.3d x3, [p0], 
d0; vmul dm4, x0, x2, r12 -; CHECK-NEXT: movs m0, p5; vldb x9, [p1], m4; vaddmac dm2, dm2, dm4, x8, x1, r10 -; CHECK-NEXT: vlda.3d x7, [p1], d1; movxm ls, #.LBB0_2; vaddmac dm1, dm1, dm4, x6, x10, r10 -; CHECK-NEXT: vldb x5, [p0], #64; movxm le, #.L_LEnd1; vaddmac dm0, dm0, dm4, x8, x10, r10 -; CHECK-NEXT: vlda.3d x3, [p0], d0; vshuffle x1, x9, x0, r7 -; CHECK-NEXT: nopa ; vldb x9, [p1], m4; movs m0, p5; add.nc lc, r6, #-3; vshuffle x10, x1, x0, r16; nopv +; CHECK-NEXT: vldb x5, [p0], #64; vmul dm4, x0, x4, r12 +; CHECK-NEXT: vlda.3d x3, [p0], d0 +; CHECK-NEXT: movs m0, p5; vldb x9, [p1], m4; vaddmac dm3, dm3, dm4, x6, x1, r10 +; CHECK-NEXT: vlda.3d x7, [p1], d1; movxm ls, #.LBB0_2; vmul dm4, x0, x2, r12 +; CHECK-NEXT: vldb x5, [p0], #64; movxm le, #.L_LEnd1; vaddmac dm2, dm2, dm4, x8, x1, r10 +; CHECK-NEXT: vlda.3d x3, [p0], d0; vshuffle x1, x9, x0, r7; vaddmac dm1, dm1, dm4, x6, x10, r10 +; CHECK-NEXT: nopa ; vldb x9, [p1], m4; movs m0, p5; add.nc lc, r6, #-3; vshuffle x10, x1, x0, r16; vaddmac dm0, dm0, dm4, x8, x10, r10 ; CHECK-NEXT: vlda.3d x7, [p1], d1; nopb ; nops ; nopx ; vshuffle x8, x7, x0, r18; nopv ; CHECK-NEXT: nopa ; vldb x5, [p0], #64; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 ; CHECK-NEXT: vlda.3d x3, [p0], d0; nopb ; nops ; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 @@ -236,7 +235,7 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; NO-PROLOGUE-SPLIT-NEXT: vlda x1, [p1], m4; paddb.2d [p4], d7; movs p2, p7; nopx ; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: padda [p2], m6; vldb x8, [p0], #64; movs m0, p5; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: nopa ; vldb.3d x10, [p0], d0; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 -; NO-PROLOGUE-SPLIT-NEXT: vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 +; NO-PROLOGUE-SPLIT-NEXT: nopa ; vldb.3d x5, [p1], 
d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml3, s0, upssign1, [p7, dj2]; vldb.128 wl3, [p4, #16]; movs p7, p6; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 ; NO-PROLOGUE-SPLIT-NEXT: vlda.ups.2x cml2, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 @@ -250,11 +249,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; NO-PROLOGUE-SPLIT-NEXT: vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22 ; NO-PROLOGUE-SPLIT-NEXT: vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x1, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r10 ; NO-PROLOGUE-SPLIT-NEXT: movxm p2, #.LBB0_1; vaddmac dm2, dm2, dm4, x10, x6, r10 -; NO-PROLOGUE-SPLIT-NEXT: vshuffle x6, x1, x0, r4; vmul dm4, x0, x4, r12 -; NO-PROLOGUE-SPLIT-NEXT: jnzd r5, r5, p2 -; NO-PROLOGUE-SPLIT-NEXT: vaddmac dm1, dm1, dm4, x8, x6, r10 // Delay Slot 5 -; NO-PROLOGUE-SPLIT-NEXT: vaddmac dm0, dm0, dm4, x10, x6, r10 // Delay Slot 4 -; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 3 +; NO-PROLOGUE-SPLIT-NEXT: jnzd r5, r5, p2; vshuffle x6, x1, x0, r4; vmul dm4, x0, x4, r12 +; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 5 +; NO-PROLOGUE-SPLIT-NEXT: vaddmac dm1, dm1, dm4, x8, x6, r10 // Delay Slot 4 +; NO-PROLOGUE-SPLIT-NEXT: vaddmac dm0, dm0, dm4, x10, x6, r10 // Delay Slot 3 ; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 2 ; NO-PROLOGUE-SPLIT-NEXT: nop // Delay Slot 1 ; NO-PROLOGUE-SPLIT-NEXT: // %bb.4: // %cooldown.entry @@ -300,7 +298,7 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; ; NO-JNZD-LABEL: gemm: ; NO-JNZD: // %bb.0: // %newFuncRoot -; NO-JNZD-NEXT: paddxm [sp], #64; nopxm +; NO-JNZD-NEXT: paddxm [sp], #64; nopb ; nopxm ; nops ; NO-JNZD-NEXT: st p6, [sp, #-64] // 4-byte Folded Spill 
; NO-JNZD-NEXT: mova m0, #-68; mov p6, sp ; NO-JNZD-NEXT: padda [p6], m0 @@ -322,19 +320,19 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; NO-JNZD-NEXT: lda r19, [p6], #-4 ; NO-JNZD-NEXT: lda m5, [p6], #-4 ; NO-JNZD-NEXT: nop -; NO-JNZD-NEXT: vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]; mov dc2, #0 +; NO-JNZD-NEXT: vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0] ; NO-JNZD-NEXT: vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; mov r21, r8 -; NO-JNZD-NEXT: vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r24, #0; mov dc0, #0 -; NO-JNZD-NEXT: vlda.ups.2x cmh2, s0, upssign1, [p2], #64; vbcst.32 x2, r24 -; NO-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc1, dc0; mov dc5, dc0 +; NO-JNZD-NEXT: vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; mov dc2, #0 +; NO-JNZD-NEXT: vlda.ups.2x cmh2, s0, upssign1, [p2], #64; mov dc0, #0 +; NO-JNZD-NEXT: vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc1, dc0; movx r24, #0; mov dc5, dc0 ; NO-JNZD-NEXT: vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vldb.3d x1, [p1], d1; movx crupsmode, #0; mov s0, r24 -; NO-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p2], #64; vldb x6, [p0], #64; movx r24, #15; mov m0, p5; movs dc4, dc0 -; NO-JNZD-NEXT: lda m2, [p6], #-4; vldb.3d x8, [p0], d0; vsel.32 x4, x2, x4, r24 +; NO-JNZD-NEXT: vlda.ups.2x cml0, s0, upssign1, [p2], #64; movx r24, #15; vbcst.32 x2, r24 +; NO-JNZD-NEXT: lda m2, [p6], #-4; vsel.32 x4, x2, x4, r24 ; NO-JNZD-NEXT: vlda.ups.2x cmh0, s0, upssign1, [p2], #64; or r23, r10, r10; vsel.32 x2, x2, x6, r24 ; NO-JNZD-NEXT: lda dj2, [p6], #-4; or r25, r12, r12; vshuffle x8, x8, x0, r1 -; NO-JNZD-NEXT: lda dn2, [p6], #-4; movx r22, #1; vshuffle x10, x8, x0, r2 -; NO-JNZD-NEXT: lda m3, [p6], #-4; movx r12, #776; mov s1, r17 -; NO-JNZD-NEXT: lda dj3, [p6, #0]; or r10, r5, r5; vshuffle x1, x1, x0, r3 +; NO-JNZD-NEXT: lda dn2, [p6], #-4; vldb x6, [p0], #64; movx 
r22, #1; vshuffle x10, x8, x0, r2; movs dc4, dc0 +; NO-JNZD-NEXT: lda m3, [p6], #-4; movs m0, p5; movx r12, #776; mov s1, r17 +; NO-JNZD-NEXT: lda dj3, [p6, #0]; vldb.3d x8, [p0], d0; or r10, r5, r5; vshuffle x1, x1, x0, r3 ; NO-JNZD-NEXT: lda dn3, [p6, #-4]; movs dc3, dc2; movx crsrsmode, #0; vshuffle x1, x1, x0, r4 ; NO-JNZD-NEXT: .LBB0_1: // %for.body ; NO-JNZD-NEXT: // =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll index 502ee192b9be..3994f2eae751 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll +++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll @@ -106,9 +106,9 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup99 ; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: padda [p2], m5; paddb.2d [p4], d7; movs m0, p5; add r0, r0, #-1; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8 -; CHECK-NEXT: nopa ; vldb x8, [p0], #64; movs p6, p2; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 -; CHECK-NEXT: nopa ; vldb x10, [p1], m4; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 -; CHECK-NEXT: nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 +; CHECK-NEXT: movs p6, p2; vldb x8, [p0], #64; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 +; CHECK-NEXT: vldb x10, [p1], m4; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 +; CHECK-NEXT: vldb.3d x5, [p1], d1; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 ; CHECK-NEXT: vlda.3d x1, [p0], d0; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8 ; CHECK-NEXT: vlda.ups.2x cml3, s0, upssign1, [p6], #64; vldb.128 wl3, [p4, #16]; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 ; CHECK-NEXT: vlda.ups.2x cmh3, s0, 
upssign1, [p6], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 @@ -122,12 +122,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; CHECK-NEXT: vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22 ; CHECK-NEXT: vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x10, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r12 ; CHECK-NEXT: vshuffle x6, x10, x0, r4; vaddmac dm2, dm2, dm4, x1, x6, r12 -; CHECK-NEXT: vmul dm4, x0, x4, r10 -; CHECK-NEXT: nop -; CHECK-NEXT: jnz r0, #.LBB0_1; vaddmac dm1, dm1, dm4, x8, x6, r12 -; CHECK-NEXT: vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: jnz r0, #.LBB0_1; vmul dm4, x0, x4, r10 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: vaddmac dm1, dm1, dm4, x8, x6, r12 // Delay Slot 4 +; CHECK-NEXT: vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.4: // %cooldown.entry diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll index c73701b5f2da..c9f84a0887b7 100644 --- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll +++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll @@ -107,9 +107,9 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup99 ; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: padda [p2], m5; paddb.2d [p4], d7; movs m0, p5; add r0, r0, #-1; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8 -; CHECK-NEXT: nopa ; vldb x8, [p0], #64; movs p6, p2; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 -; CHECK-NEXT: nopa ; vldb x10, [p1], m4; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 -; CHECK-NEXT: nopa ; vldb.3d 
x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 +; CHECK-NEXT: movs p6, p2; vldb x8, [p0], #64; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 +; CHECK-NEXT: vldb x10, [p1], m4; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 +; CHECK-NEXT: vldb.3d x5, [p1], d1; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8 ; CHECK-NEXT: vlda.3d x1, [p0], d0; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8 ; CHECK-NEXT: vlda.ups.2x cml3, s0, upssign1, [p6], #64; vldb.128 wl3, [p4, #16]; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8 ; CHECK-NEXT: vlda.ups.2x cmh3, s0, upssign1, [p6], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8 @@ -123,12 +123,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt ; CHECK-NEXT: vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22 ; CHECK-NEXT: vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x10, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r12 ; CHECK-NEXT: vshuffle x6, x10, x0, r4; vaddmac dm2, dm2, dm4, x1, x6, r12 -; CHECK-NEXT: vmul dm4, x0, x4, r10 -; CHECK-NEXT: nop -; CHECK-NEXT: jnz r0, #.LBB0_1; vaddmac dm1, dm1, dm4, x8, x6, r12 -; CHECK-NEXT: vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: jnz r0, #.LBB0_1; vmul dm4, x0, x4, r10 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: vaddmac dm1, dm1, dm4, x8, x6, r12 // Delay Slot 4 +; CHECK-NEXT: vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.4: // %cooldown.entry diff --git a/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll b/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll index a31040ee5271..3b6b3ed3e574 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll @@ -204,8 
+204,8 @@ define dso_local void @_Z5test4i(i32 noundef %n) { ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: nop // Delay Slot 3 -; AIE2P-NEXT: st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 2 -; AIE2P-NEXT: nop // Delay Slot 1 +; AIE2P-NEXT: nop // Delay Slot 2 +; AIE2P-NEXT: st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 1 ; AIE2P-NEXT: .LBB1_1: // %for.body ; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 ; AIE2P-NEXT: nopa ; nopb ; jl #_Z16addToSymbolTablePKci; nops @@ -309,10 +309,10 @@ define dso_local void @memcpy_lowered_to_call(ptr nocapture writeonly %a, ptr no ; AIE2P-NEXT: nopa ; nopb ; st r8, [sp, #-60]; ge r0, r1, r0; mov r8, r0; nopv // 4-byte Folded Spill ; AIE2P-NEXT: jnz r0, #.LBB2_3 ; AIE2P-NEXT: nopx // Delay Slot 5 -; AIE2P-NEXT: st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 4 -; AIE2P-NEXT: st p6, [sp, #-56] // 4-byte Folded Spill Delay Slot 3 -; AIE2P-NEXT: st p7, [sp, #-52] // 4-byte Folded Spill Delay Slot 2 -; AIE2P-NEXT: nop // Delay Slot 1 +; AIE2P-NEXT: nop // Delay Slot 4 +; AIE2P-NEXT: st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 3 +; AIE2P-NEXT: st p6, [sp, #-56] // 4-byte Folded Spill Delay Slot 2 +; AIE2P-NEXT: st p7, [sp, #-52] // 4-byte Folded Spill Delay Slot 1 ; AIE2P-NEXT: // %bb.1: ; AIE2P-NEXT: movs p6, p0; mov p7, p1 ; AIE2P-NEXT: .LBB2_2: // %for.body diff --git a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll index 39aa8210245f..8d4b34ef041c 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll @@ -24,17 +24,16 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2: // %bb.0: // %for.cond3.preheader.lr.ph ; AIE2-NEXT: nopb ; mova r3, #0; nops ; nopxm ; nopv ; AIE2-NEXT: mova r4, #2; nopx -; AIE2-NEXT: movxm p2, #.LBB0_2 ; AIE2-NEXT: lda r2, [p0, #0] +; AIE2-NEXT: movxm p2, #.LBB0_2 ; AIE2-NEXT: .LBB0_1: // 
%for.cond3.preheader ; AIE2-NEXT: // =>This Loop Header: Depth=1 ; AIE2-NEXT: // Child Loop BB0_2 Depth 2 -; AIE2-NEXT: nopa ; lshl r5, r3, r4; nopm +; AIE2-NEXT: nopa ; nopb ; lshl r5, r3, r4; nopm ; AIE2-NEXT: mov dj0, r5 ; AIE2-NEXT: lda p3, [p1, dj0] ; AIE2-NEXT: nop ; AIE2-NEXT: nop -; AIE2-NEXT: nop ; AIE2-NEXT: mova r6, #0 ; AIE2-NEXT: add.nc r5, r1, #-1 ; AIE2-NEXT: .LBB0_2: // %for.body6 @@ -73,17 +72,16 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2P: // %bb.0: // %for.cond3.preheader.lr.ph ; AIE2P-NEXT: mova r3, #0; nopb ; nops ; nopxm ; nopv ; AIE2P-NEXT: mova r4, #2; nopx -; AIE2P-NEXT: movxm p2, #.LBB0_2 ; AIE2P-NEXT: lda r2, [p0, #0] +; AIE2P-NEXT: movxm p2, #.LBB0_2 ; AIE2P-NEXT: .LBB0_1: // %for.cond3.preheader ; AIE2P-NEXT: // =>This Loop Header: Depth=1 ; AIE2P-NEXT: // Child Loop BB0_2 Depth 2 -; AIE2P-NEXT: nopa ; lshl r5, r3, r4; nopm +; AIE2P-NEXT: nopa ; nopb ; lshl r5, r3, r4; nopm ; AIE2P-NEXT: mov dj0, r5 ; AIE2P-NEXT: lda p3, [p1, dj0] ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop -; AIE2P-NEXT: nop ; AIE2P-NEXT: mova r6, #0 ; AIE2P-NEXT: add.nc r5, r1, #-1 ; AIE2P-NEXT: .LBB0_2: // %for.body6 @@ -127,12 +125,11 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2PS-NEXT: .LBB0_1: // %for.cond3.preheader ; AIE2PS-NEXT: // =>This Loop Header: Depth=1 ; AIE2PS-NEXT: // Child Loop BB0_2 Depth 2 -; AIE2PS-NEXT: nopa ; lshl r16, r4, r6; nopm +; AIE2PS-NEXT: nopa ; nopb ; lshl r16, r4, r6; nopm ; AIE2PS-NEXT: mov dj0, r16 ; AIE2PS-NEXT: lda p3, [p1, dj0] ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop -; AIE2PS-NEXT: nop ; AIE2PS-NEXT: addm.nc r3, r1, #-1 ; AIE2PS-NEXT: mova r16, #0 ; AIE2PS-NEXT: .LBB0_2: // %for.body6 diff --git a/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll b/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll index cb2b638af789..4c89cd3bc47c 100644 --- a/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll +++ 
b/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll @@ -23,8 +23,8 @@ define void @test_commit_block_schedule(i1 %0) { ; CHECK-NEXT: nopa ; nopb ; nops ; movxm le, #.L_LEnd0; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; nopv ; CHECK-NEXT: .LBB0_2: // %for.body54 ; CHECK-NEXT: // Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AIE/switch.ll b/llvm/test/CodeGen/AIE/switch.ll index 01686e40e019..2ec5147ccf10 100644 --- a/llvm/test/CodeGen/AIE/switch.ll +++ b/llvm/test/CodeGen/AIE/switch.ll @@ -164,8 +164,8 @@ define i32 @test(i8 signext %i) noinline nounwind optnone { ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: nop // Delay Slot 3 -; AIE2P-NEXT: st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 2 -; AIE2P-NEXT: nop // Delay Slot 1 +; AIE2P-NEXT: nop // Delay Slot 2 +; AIE2P-NEXT: st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 1 ; AIE2P-NEXT: // %bb.1: // %entry ; AIE2P-NEXT: movxm p0, ##.LJTI0_0 ; AIE2P-NEXT: movxm r1, #1048575