diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
index bf08fdbf2ff4..1cbf15e7065f 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -269,8 +269,6 @@ class BiasDepth : public ScheduleDAGMutation {
 };
 
 class RegionEndEdges : public ScheduleDAGMutation {
-  AAResults *AA;
-
   void removeExitSUPreds(ScheduleDAGInstrs *DAG) {
     SUnit &ExitSU = DAG->ExitSU;
     while (!ExitSU.Preds.empty()) {
@@ -278,7 +276,7 @@ class RegionEndEdges : public ScheduleDAGMutation {
     }
   }
   void apply(ScheduleDAGInstrs *DAG) override {
-    AIE::MaxLatencyFinder MaxLatency(DAG, AA);
+    AIE::MaxLatencyFinder MaxLatency(DAG);
     MachineBasicBlock *PrologueMBB = DAG->getBB();
     unsigned int ZOLBundlesCount = 0;
 
@@ -349,7 +347,7 @@ class RegionEndEdges : public ScheduleDAGMutation {
   };
 
 public:
-  RegionEndEdges(AAResults *AA = nullptr) : AA(AA) {}
+  RegionEndEdges() = default;
 };
 
 /// This Mutator is responsible for emitting "fixed" SUnits at the top or bottom
@@ -912,16 +910,30 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT, AAResults *AA) {
   if (!TT.isAIE1()) {
     if (EnableWAWStickyRegisters)
       Mutations.emplace_back(std::make_unique<WAWStickyRegistersEdges>());
-    Mutations.emplace_back(std::make_unique<RegionEndEdges>(AA));
+    // RegionEndEdges must run before MemoryEdges/WAWEdges/BiasDepth, and
+    // EmitFixedSUnits must run last. Both are applied via applyMutations()
+    // inside AIEPostRASchedStrategy::buildGraph, which also suppresses the
+    // redundant postProcessDAG() call from ScheduleDAGMI::schedule().
+    Mutations.emplace_back(createRegionEndEdgesMutation());
     Mutations.emplace_back(std::make_unique<MemoryEdges>(true));
     Mutations.emplace_back(std::make_unique<MachineSchedWAWEdges>());
     Mutations.emplace_back(std::make_unique<BiasDepth>());
-    Mutations.emplace_back(std::make_unique<EmitFixedSUnits>(
-        EnableAAInEmitFixedSUnits ? AA : nullptr));
+    Mutations.emplace_back(createEmitFixedSUnitsMutation(AA));
   }
   return Mutations;
 }
 
+std::unique_ptr<ScheduleDAGMutation>
+AIEBaseSubtarget::createRegionEndEdgesMutation() {
+  return std::make_unique<RegionEndEdges>();
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+AIEBaseSubtarget::createEmitFixedSUnitsMutation(AAResults *AA) {
+  AAResults *const UsedAA = EnableAAInEmitFixedSUnits ? AA : nullptr;
+  return std::make_unique<EmitFixedSUnits>(UsedAA); // AA only if enabled.
+}
+
 // List the Mutations that apply to the interblock DAG construction.
 std::vector<std::unique_ptr<ScheduleDAGMutation>>
 AIEBaseSubtarget::getDDGMutationsImpl(const Triple &TT, bool ExactLatencies) {
diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h
index 9eaa4b29d920..9e0146d50d92 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h
@@ -56,8 +56,12 @@ class AIEBaseSubtarget : public TargetSubtargetInfo {
   }
   void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
                               &Mutations) const override {
-    Mutations =
-        AIEBaseSubtarget::getPostRAMutationsImpl(getTargetTriple(), nullptr);
+    // Post-RA mutations are applied directly in
+    // AIEPostRASchedStrategy::buildGraph, which owns the full graph
+    // construction pipeline. The registered Mutations list is intentionally
+    // empty so that the postProcessDAG() call in ScheduleDAGMI::schedule()
+    // is a no-op.
+    Mutations.clear();
   }
 
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
@@ -84,6 +88,17 @@ class AIEBaseSubtarget : public TargetSubtargetInfo {
   static std::vector<std::unique_ptr<ScheduleDAGMutation>>
   getSMSMutationsImpl(const Triple &TT);
 
+  /// Create the RegionEndEdges mutation for use in buildGraph. Ordering is
+  /// significant: getPostRAMutationsImpl places it before MemoryEdges,
+  /// MachineSchedWAWEdges and BiasDepth, and before EmitFixedSUnits.
+  static std::unique_ptr<ScheduleDAGMutation> createRegionEndEdgesMutation();
+
+  /// Create the EmitFixedSUnits mutation for use in buildGraph, invoked after
+  /// createRegionEndEdgesMutation to preserve the ExitSU-edge ordering
+  /// invariant.
+  static std::unique_ptr<ScheduleDAGMutation>
+  createEmitFixedSUnitsMutation(AAResults *AA);
+
   /// Whether to enable the pre-RA MachinePipeliner. This can be disabled to let
   /// the post-RA pipeliner handle the scheduling.
   bool enableMachinePipeliner() const override;
diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
index a8cb2cd06ef3..2b1b64d08f38 100644
--- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
@@ -378,13 +378,11 @@ void AIEBasePassConfig::addPreSched2() {
 
 ScheduleDAGInstrs *
 AIEBaseTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
-  ScheduleDAGMI *DAG =
-      new AIEScheduleDAGMI(C, std::make_unique<AIEPostRASchedStrategy>(C),
-                           /* RemoveKillFlags=*/true);
-  for (auto &Mutation :
-       AIEBaseSubtarget::getPostRAMutationsImpl(getTargetTriple(), C->AA))
-    DAG->addMutation(std::move(Mutation));
-  return DAG;
+  // Post-RA mutations are applied directly in
+  // AIEPostRASchedStrategy::buildGraph, so the registered Mutations list is
+  // intentionally empty (matching the empty list from getPostRAMutations).
+  return new AIEScheduleDAGMI(C, std::make_unique<AIEPostRASchedStrategy>(C),
+                              /* RemoveKillFlags=*/true);
 }
 
 ScheduleDAGInstrs *
diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
index 08116d367768..80b79cba36c7 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 
@@ -72,4 +72,67 @@ void DataDependenceHelper::dumpDot(raw_ostream &OS,
   OS << "}\n";
 }
 
+void InterBlockEdges::addNode(MachineInstr *MI) {
+  const auto NodeNum = initSUnit(*MI);
+  if (!NodeNum)
+    return; // No SUnit was created, so MI is registered on neither side.
+  (Boundary ? SuccMap : PredMap).emplace(MI, *NodeNum);
+}
+
+void InterBlockEdges::markBoundary() { Boundary = SUnits.size(); }
+
+bool InterBlockEdges::mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) {
+  // With SafeToIgnoreMemDeps set, nodes on opposite sides of the boundary are
+  // never reported as aliasing, which suppresses cross-boundary memory edges.
+  // While no boundary is marked, no node is post-boundary: nothing crosses.
+  const bool CrossesBoundary =
+      isPostBoundaryNode(SUa) != isPostBoundaryNode(SUb);
+  if (SafeToIgnoreMemDeps && CrossesBoundary)
+    return false;
+  return DataDependenceHelper::mayAlias(SUa, SUb, TBAA);
+}
+
+const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const {
+  // PredMap only holds the instances that were added before markBoundary().
+  if (const auto It = PredMap.find(MI); It != PredMap.end()) {
+    return &SUnits.at(It->second);
+  }
+  return nullptr;
+}
+
+bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
+  return Boundary.has_value() && SU->NodeNum >= *Boundary;
+}
+
+void InterBlockEdges::recordPostDepth(MachineInstr *MI, int Depth) {
+  // Depths are stored per NodeNum. An MI without a post-boundary instance in
+  // SuccMap is silently ignored.
+  if (const auto It = SuccMap.find(MI); It != SuccMap.end())
+    PostDepths[It->second] = Depth;
+}
+
+int InterBlockEdges::getPostDepthOr(const SUnit *SU, int Default) const {
+  const auto It = PostDepths.find(SU->NodeNum);
+  return It != PostDepths.end() ? It->second : Default;
+}
+
+void InterBlockEdges::recordPreHeightsFromSuccessors() {
+  // For every pre-boundary node, keep the smallest height among its
+  // post-boundary successors; nodes without such a successor get no entry.
+  constexpr int NoSuccessor = std::numeric_limits<int>::max();
+  for (const auto &[MI, NodeNum] : PredMap) {
+    int MinHeight = NoSuccessor;
+    for (const SDep &Dep : SUnits.at(NodeNum).Succs)
+      if (isPostBoundaryNode(Dep.getSUnit()))
+        MinHeight = std::min(MinHeight, int(Dep.getSUnit()->getHeight()));
+    if (MinHeight != NoSuccessor)
+      PreHeights[NodeNum] = MinHeight;
+  }
+}
+
+int InterBlockEdges::getPreHeight(const SUnit *SU) const {
+  const auto It = PreHeights.find(SU->NodeNum);
+  return It != PreHeights.end() ? It->second : std::numeric_limits<int>::max();
+}
+
 } // end namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
index 94ad326ef07f..dcace95370ad 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -17,6 +17,9 @@
 
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include <limits>
+#include <map>
+#include <optional>
 
 namespace llvm {
 
@@ -36,6 +39,8 @@ class DataDependenceHelper : public ScheduleDAGInstrs {
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
   const MachineSchedContext &Context;
   void schedule() override {};
+
+protected:
   bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override;
 
 public:
@@ -53,6 +58,111 @@ class DataDependenceHelper : public ScheduleDAGInstrs {
   // are printed.
   void dumpDot(raw_ostream &OS, bool IncludeBoundaries) const;
 };
+
+/// This class generates all edges between nodes in two flow-adjacent regions.
+/// The nodes are added in forward flow order, marking the boundary at the
+/// appropriate point.
+///
+/// When SafeToIgnoreMemDeps is set, memory-alias edges that cross the
+/// pre/post boundary are suppressed via a mayAlias() override.
+///
+/// The class also provides optional depth and height maps (both keyed by SUnit
+/// NodeNum, so they remain unambiguous when the same MachineInstr* appears on
+/// both sides of the boundary, e.g. in a single-block loop):
+///
+///   PostDepths — top-down cycle of each post-boundary node.  Populated by
+///   recordPostDepth(); queried by getPostDepthOr().
+///
+///   PreHeights — for each pre-boundary node, the minimum getHeight() of its
+///   post-boundary successors in the DDG.  Populated by
+///   recordPreHeightsFromSuccessors() after buildEdges(); queried by
+///   getPreHeight().
+///
+///   PreRegionLength — total number of bundles in the pre-boundary region.
+///
+///   PostRegionLength — total number of bundles in the post-boundary region,
+///   used to represent the depth of the artificial ExitSU node.
+class InterBlockEdges : public DataDependenceHelper {
+  // The boundary between Pred and Succ nodes.
+  std::optional<unsigned> Boundary;
+  // When true, memory edges crossing the boundary are suppressed.
+  bool SafeToIgnoreMemDeps = false;
+
+  /// We can add the same instruction on both sides of the boundary.
+  /// We maintain explicit maps to retrieve the corresponding SUnit.
+  using IndexMap = std::map<MachineInstr *, unsigned>;
+  IndexMap PredMap;
+  IndexMap SuccMap;
+
+  /// Depth (top-down cycle) of post-boundary SUnits, keyed by NodeNum.
+  std::map<unsigned, int> PostDepths;
+  /// For each pre-boundary SUnit, the minimum getHeight() of its
+  /// post-boundary successors (keyed by NodeNum).
+  std::map<unsigned, int> PreHeights;
+  /// Total number of bundles in the pre-boundary region.
+  int PreRegionLength = 0;
+  /// Total number of bundles in the post-boundary region.
+  int PostRegionLength = 0;
+
+  bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override;
+
+public:
+  InterBlockEdges(const MachineSchedContext &Context,
+                  bool SafeToIgnoreMemDeps = false)
+      : DataDependenceHelper(Context, true, true),
+        SafeToIgnoreMemDeps(SafeToIgnoreMemDeps) {}
+
+  /// Add a Node to the DAG.
+  void addNode(MachineInstr *);
+
+  /// Mark the boundary between the predecessor block and the successor block.
+  /// In normal operation, there should just be one call to this method.
+  /// Nodes added before are part of the predecessor, nodes added after are
+  /// part of the successor.
+  void markBoundary();
+
+  /// To iterate forward across the SUnits of the underlying DDG.
+  auto begin() const { return SUnits.begin(); }
+  auto end() const { return SUnits.end(); }
+
+  /// The following two methods are used to find the cross-boundary edges,
+  /// by starting from a pre-boundary node and selecting its successor edges
+  /// that connect to a post-boundary node.
+  /// ---
+  /// Retrieve the SUnit that represents MI's instance before the
+  /// boundary, null if not found.
+  const SUnit *getPreBoundaryNode(MachineInstr *MI) const;
+
+  /// Check whether SU represents an instruction after the boundary.
+  bool isPostBoundaryNode(SUnit *SU) const;
+
+  // Post-boundary depth interface.
+  /// Record the top-down cycle of a post-boundary instruction.
+  void recordPostDepth(MachineInstr *MI, int Depth);
+  /// Get the recorded top-down cycle of a post-boundary SUnit, or \p Default
+  /// if no depth has been recorded (e.g. the instruction is beyond the
+  /// conflict horizon).
+  int getPostDepthOr(const SUnit *SU, int Default) const;
+  /// Clear all recorded post-boundary depths.  Call before repopulating.
+  void clearPostDepths() { PostDepths.clear(); }
+
+  // Pre-boundary height interface.
+  /// Compute and store, for each pre-boundary SUnit, the minimum getHeight()
+  /// of its post-boundary successors.  Must be called after buildEdges().
+  void recordPreHeightsFromSuccessors();
+  /// Get the stored height of a pre-boundary SUnit.
+  /// Returns INT_MAX if not recorded (conservative: no loop-carried use).
+  int getPreHeight(const SUnit *SU) const;
+
+  // Pre-boundary region length.
+  void setPreRegionLength(int Length) { PreRegionLength = Length; }
+  int getPreRegionLength() const { return PreRegionLength; }
+
+  // Post-boundary region length (used as depth of the ExitSU node).
+  void setPostRegionLength(int Length) { PostRegionLength = Length; }
+  int getPostRegionLength() const { return PostRegionLength; }
+};
+
 } // namespace AIE
 } // namespace llvm
 
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index 61ad93cc7711..814405a06e2d 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -383,6 +383,13 @@ void InterBlockScheduling::enterBlock(MachineBasicBlock *BB) {
                       << CurrentBlockState->kindAsString() << " FixPointIter="
                       << CurrentBlockState->FixPoint.NumIters
                       << " II=" << CurrentBlockState->FixPoint.II << "\n");
+  // Emit SWP prologues/epilogues that belong to this block. This only applies
+  // in the Scheduling stage: during GatheringRegions the regions are only being
+  // recorded without physically inserting any SWP code yet.
+  if (CurrentBlockState->FixPoint.Stage != SchedulingStage::GatheringRegions) {
+    emitInterBlockTop(*CurrentBlockState);
+    emitInterBlockBottom(*CurrentBlockState);
+  }
 }
 namespace {
 /// This implements the interface to the postpipeliner to extract the
@@ -529,7 +536,7 @@ InterBlockScheduling::resourcesConverged(BlockState &BS,
   return nullptr;
 }
 
-MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
+MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) {
   const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget();
   auto *TII = static_cast<const AIEBaseInstrInfo *>(SubTarget.getInstrInfo());
   auto *ItinData = SubTarget.getInstrItineraryData();
@@ -543,17 +550,18 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
   // If the successor is in Top, we lookup its depth in TopDepth
   const Region &Bottom = BS.getBottom();
   const Region &Top = BS.getTop();
-  const InterBlockEdges &BackEdges = BS.getBoundaryEdges();
+  InterBlockEdges &BackEdges = BS.getBoundaryEdges();
 
-  // Record the depth of all instructions in Top. Don't record the ones that
-  // can't cause problems
-  std::map<MachineInstr *, int> TopDepth;
-  int Depth = 0;
+  // Repopulate the post-boundary depths from the current scheduled bundles of
+  // the top region, capped at the conflict horizon.  Clear first so that stale
+  // values from a previous fixpoint iteration are not retained.
+  BackEdges.clearPostDepths();
+  int TopDepth = 0;
   for (auto &Bundle : Top.Bundles) {
     for (auto *MI : Bundle.getInstrs()) {
-      TopDepth[MI] = Depth;
+      BackEdges.recordPostDepth(MI, TopDepth);
     }
-    if (++Depth > HR->getConflictHorizon()) {
+    if (++TopDepth > HR->getConflictHorizon()) {
       break;
     }
   }
@@ -579,14 +587,14 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
           continue;
         }
         DEBUG_LOOPAWARE(dbgs() << "  Backedge to " << Succ->NodeNum << "\n");
-        auto DepthIt = TopDepth.find(Succ->getInstr());
-        if (DepthIt == TopDepth.end()) {
-          // Over the horizon
-          continue;
-        }
-        DEBUG_LOOPAWARE(dbgs() << "  Depth=" << DepthIt->second << "\n");
+        // Instructions beyond the conflict horizon default to ConflictHorizon,
+        // so that Distance = Height + ConflictHorizon >= 1 + ConflictHorizon,
+        // which is always >= Latency, naturally avoiding false positives.
+        const int SuccDepth =
+            BackEdges.getPostDepthOr(Succ, HR->getConflictHorizon());
+        DEBUG_LOOPAWARE(dbgs() << "  Depth=" << SuccDepth << "\n");
         int Latency = SDep.getSignedLatency();
-        int Distance = Height + DepthIt->second;
+        int Distance = Height + SuccDepth;
         if (Distance < Latency) {
           DEBUG_LOOPAWARE(dbgs() << "  Latency(" << Pred->NodeNum << "->"
                                  << Succ->NodeNum << ")=" << Latency
@@ -610,19 +618,22 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
 }
 
 SchedulingStage InterBlockScheduling::updateFixPoint(BlockState &BS) {
-  if (BS.Kind != BlockType::Loop) {
-    return SchedulingStage::SchedulingDone;
-  }
-
   if (BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
-    // This is the first time we schedule this loop. In that first
-    // iteration, we have recorded the region decomposition.
-    // Now we can create the interblock edges between the top and the bottom
-    // region
-    BS.initInterBlock(*Context, *HR);
+    // This is the first visit to this block. The region decomposition has been
+    // gathered. Now transition to Scheduling so the next pass actually
+    // schedules the gathered regions.
+    if (BS.Kind == BlockType::Loop) {
+      // For loops, also create the interblock edges between the top and the
+      // bottom region.
+      BS.initInterBlock(*Context, *HR);
+    }
     return SchedulingStage::Scheduling;
   }
 
+  if (BS.Kind != BlockType::Loop) {
+    return SchedulingStage::SchedulingDone;
+  }
+
   BS.FixPoint.NumIters++;
   if (BS.FixPoint.Stage == SchedulingStage::Scheduling) {
     return updateScheduling(BS);
@@ -839,16 +850,33 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB,
   DEBUG_BLOCKS(dbgs() << "    >> enterRegion, Iter=" << BS.FixPoint.NumIters
                       << "\n");
 
-  // Only add regions of loops when in the GatheringRegions phase
-  if (BS.Kind != BlockType::Loop ||
-      BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
-    ArrayRef<MachineBundle> TopFixedBundles =
-        RegionBegin == BB->begin() ? ArrayRef<MachineBundle>(BS.TopInsert)
-                                   : ArrayRef<MachineBundle>();
-    ArrayRef<MachineBundle> BotFixedBundles =
-        RegionEnd == BB->end() ? ArrayRef<MachineBundle>(BS.BottomInsert)
-                               : ArrayRef<MachineBundle>();
-    BS.addRegion(BB, RegionBegin, RegionEnd, TopFixedBundles, BotFixedBundles);
+  if (BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
+    // Gather region boundaries and capture the invariant SemanticOrder for all
+    // block types. Fixed bundles are NOT set here: they result from loop
+    // pipelining, which happens during Scheduling, and are applied via the
+    // setTopFixedBundles / setBotFixedBundles calls in the Scheduling pass.
+    BS.addRegion(BB, RegionBegin, RegionEnd);
+  } else if (BS.Kind != BlockType::Loop) {
+    // Scheduling pass for non-loop blocks: set fixed bundles on the
+    // pre-gathered region now that emitInterBlockTop / emitInterBlockBottom
+    // has physically inserted the SWP instructions into the block.
+    //
+    // If Regions is empty, the block was empty during GatheringRegions (e.g.
+    // a newly-created dedicated exit block). The machine scheduler skips
+    // enterRegion for empty blocks so no region was captured. Create it now
+    // with correct free-instruction boundaries, excluding any fixed bundles.
+    if (BS.getRegions().empty()) {
+      const unsigned TopCount =
+          (RegionBegin == BB->begin()) ? BS.TopInsert.size() : 0u;
+      const unsigned BotCount =
+          (RegionEnd == BB->end()) ? BS.BottomInsert.size() : 0u;
+      BS.addRegion(BB, std::next(RegionBegin, TopCount),
+                   std::prev(RegionEnd, BotCount));
+    }
+    if (RegionBegin == BB->begin() && !BS.TopInsert.empty())
+      BS.getCurrentRegion().setTopFixedBundles(BS.TopInsert);
+    if (RegionEnd == BB->end() && !BS.BottomInsert.empty())
+      BS.getCurrentRegion().setBotFixedBundles(BS.BottomInsert);
   }
 }
 
@@ -1042,26 +1070,34 @@ int InterBlockScheduling::getCyclesToRespectTiming(
   int DistFromLoopEntry = 0;
   int EntryNops = 0;
 
-  auto AddRegionToEdges = [&](const Region &R) {
-    for (auto &Bundle : R.Bundles) {
-      for (MachineInstr *MI : Bundle.getInstrs()) {
-        DistancesFromLoopEntry[MI] = DistFromLoopEntry;
-      }
-      ++DistFromLoopEntry;
-    }
+  auto AddRegionToEdges = [&](const Region &R, bool IsPostBoundary = false) {
+    // Add nodes first so that SuccMap/PredMap are populated before depth
+    // recording (recordPostDepth looks up SuccMap by MachineInstr*).
     // Here we need to iterate using semantic order.
     assert(R.top_fixed_instrs().empty() && "SWP epilogue already emitted?");
     for (MachineInstr *MI : R.getFreeInstructions()) {
       Edges.addNode(MI);
     }
+    for (auto &Bundle : R.Bundles) {
+      for (MachineInstr *MI : Bundle.getInstrs()) {
+        if (IsPostBoundary) {
+          Edges.recordPostDepth(MI, DistFromLoopEntry);
+        } else {
+          DistancesFromLoopEntry[MI] = DistFromLoopEntry;
+        }
+      }
+      ++DistFromLoopEntry;
+    }
   };
 
   // Construction of the superblock containing Loop+Epilogue
   // First part is the loop
   AddRegionToEdges(LoopBS.getBottom());
+  Edges.setPreRegionLength(DistFromLoopEntry);
   Edges.markBoundary();
   // Second part is the epilogue itself
-  AddRegionToEdges(EpilogueBS.getTop());
+  AddRegionToEdges(EpilogueBS.getTop(), /*IsPostBoundary=*/true);
+  Edges.setPostRegionLength(DistFromLoopEntry - Edges.getPreRegionLength());
   Edges.buildEdges();
 
   DEBUG_LOOPAWARE(dumpInterBlock(Edges));
@@ -1081,11 +1117,12 @@ int InterBlockScheduling::getCyclesToRespectTiming(
 
         const int PostBoundOrExitDist =
             (PostBoundaryMI != nullptr)
-                ? DistancesFromLoopEntry[PostBoundaryMI]
-                // When getInstr returns nullptr, we reached
-                // ExitSU. We can consider the DistFromLoopEntry as
-                // depth of the ExitSU.
-                : DistFromLoopEntry;
+                ? Edges.getPostDepthOr(Succ, 0)
+                // When getInstr returns nullptr, we reached ExitSU.
+                // The coordinate system counts from the start of the loop
+                // (same as DistancesFromLoopEntry), so ExitSU is at
+                // pre-region + post-region bundles.
+                : Edges.getPreRegionLength() + Edges.getPostRegionLength();
 
         const int Latency = SDep.getSignedLatency();
         const int Distance =
@@ -1150,56 +1187,14 @@ int InterBlockScheduling::getCyclesToAvoidResourceConflicts(
   return NopCounter;
 }
 
-void InterBlockEdges::addNode(MachineInstr *MI) {
-  if (auto Index = DDG.initSUnit(*MI)) {
-    IndexMap &TheMap = Boundary ? SuccMap : PredMap;
-    TheMap.emplace(MI, *Index);
-  }
-}
-
-// Mark the boundary between the predecessor block and the successor block
-void InterBlockEdges::markBoundary() { Boundary = DDG.SUnits.size(); }
-
-const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const {
-  auto Found = PredMap.find(MI);
-  if (Found == PredMap.end()) {
-    return nullptr;
-  }
-
-  return &DDG.SUnits.at(Found->second);
-}
-
-bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
-  return Boundary ? SU->NodeNum >= *Boundary : false;
-}
-
 Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
-               MachineBasicBlock::iterator End,
-               ArrayRef<MachineBundle> TopFixedBundles,
-               ArrayRef<MachineBundle> BotFixedBundles)
-    : BB(BB), TopFixedBundles(TopFixedBundles),
-      BotFixedBundles(BotFixedBundles) {
-  MachineBasicBlock::iterator FreeBegin =
-      std::next(Begin, TopFixedBundles.size());
-  MachineBasicBlock::iterator FreeEnd = std::prev(End, BotFixedBundles.size());
-
-  // Verify that all fixed instructions are at the right place in the MBB
-  assert(TopFixedBundles.empty() || Begin == BB->begin());
-  assert(TopFixedBundles.empty() ||
-         all_of(TopFixedBundles.back().Instrs, [FreeBegin](
-                                                   const MachineInstr *MI) {
-           return getBundleStart(MI->getIterator()) == std::prev(FreeBegin);
-         }));
-  assert(BotFixedBundles.empty() || End == BB->end());
-  assert(
-      BotFixedBundles.empty() ||
-      all_of(BotFixedBundles.front().Instrs, [FreeEnd](const MachineInstr *MI) {
-        return getBundleStart(MI->getIterator()) == FreeEnd;
-      }));
-
+               MachineBasicBlock::iterator End)
+    : BB(BB) {
   // When the region is created, its instructions haven't been re-ordered yet,
-  // so this is effectively saving the semantic order.
-  for (auto It = FreeBegin; It != FreeEnd; ++It) {
+  // so this is effectively saving the semantic order. Fixed bundles (if any)
+  // are set separately via setTopFixedBundles / setBotFixedBundles, which
+  // leave SemanticOrder as is, so [Begin, End) must exclude fixed bundles.
+  for (auto It = Begin; It != End; ++It) {
     SemanticOrder.push_back(&*It);
   }
   if (End != BB->end()) {
@@ -1207,6 +1202,32 @@ Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
   }
 }
 
+void Region::setTopFixedBundles(ArrayRef<MachineBundle> Bundles) {
+  assert(TopFixedBundles.empty() && "TopFixedBundles already set.");
+  // Verify the fixed instructions are at the top of the block (asserts only).
+  [[maybe_unused]] const auto FreeBegin =
+      std::next(BB->begin(), Bundles.size());
+  assert(Bundles.empty() ||
+         all_of(Bundles.back().Instrs, [FreeBegin](const MachineInstr *MI) {
+           return getBundleStart(MI->getIterator()) == std::prev(FreeBegin);
+         }));
+  // SemanticOrder already holds only the free instructions: keep it as is.
+  TopFixedBundles = Bundles;
+}
+
+void Region::setBotFixedBundles(ArrayRef<MachineBundle> Bundles) {
+  assert(BotFixedBundles.empty() && "BotFixedBundles already set.");
+  // Verify the fixed instructions are at the end of the block (asserts only).
+  // An empty list is accepted and skips the placement check.
+  [[maybe_unused]] const auto FreeEnd = std::prev(BB->end(), Bundles.size());
+  assert(Bundles.empty() ||
+         all_of(Bundles.front().Instrs, [FreeEnd](const MachineInstr *MI) {
+           return getBundleStart(MI->getIterator()) == FreeEnd;
+         }));
+  // SemanticOrder already holds only the free instructions: keep it as is.
+  BotFixedBundles = Bundles;
+}
+
 BlockState::BlockState(MachineBasicBlock *Block) : TheBlock(Block) {
   classify();
   setBlockProperties();
@@ -1267,7 +1288,6 @@ void BlockState::classify() {
   if (LoopAware && IsLoop(TheBlock) &&
       llvm::all_of(TheBlock->successors(), CanFixLoopSchedule)) {
     Kind = BlockType::Loop;
-    FixPoint.Stage = SchedulingStage::GatheringRegions;
   }
 
   // We will mark the epilogues in a second sweep, when all states have been
@@ -1325,6 +1345,7 @@ void BlockState::initInterBlock(const MachineSchedContext &Context,
     BoundaryEdges->addNode(MI);
   }
   BoundaryEdges->buildEdges();
+  BoundaryEdges->recordPreHeightsFromSuccessors();
   DEBUG_LOOPAWARE(dumpInterBlock(*BoundaryEdges));
 }
 
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index bbd3e75770f6..f8260989ed78 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -31,53 +31,6 @@
 
 namespace llvm::AIE {
 
-/// This class generates all edges between nodes in two flow-adjacent regions
-/// The nodes are added in forward flow order, marking the boundary at the
-/// appropriate point.
-class InterBlockEdges {
-  DataDependenceHelper DDG;
-  // the boundary between Pred and Succ nodes
-  std::optional<unsigned> Boundary;
-
-  /// We can add the same instruction on both sides of the boundary.
-  /// We maintain explicit maps to retrieve the corresponding SUnit
-  using IndexMap = std::map<MachineInstr *, unsigned>;
-  IndexMap PredMap;
-  IndexMap SuccMap;
-
-public:
-  InterBlockEdges(const MachineSchedContext &Context)
-      : DDG(Context, true, true) {}
-
-  /// Add a Node to the DAG.
-  void addNode(MachineInstr *);
-
-  /// Mark the boundary between the predecessor block and the successor block.
-  /// In normal operation, there should just be one call to this method.
-  /// Nodes added before are part of the predecesor, nodes added after are
-  /// part of the successor
-  void markBoundary();
-
-  /// Create all the edges by interpreting read and write events of the nodes
-  // in reverse order.
-  void buildEdges() { DDG.buildEdges(); }
-
-  /// To iterate forward across the SUnits of the underlying DDG.
-  auto begin() const { return DDG.SUnits.begin(); }
-  auto end() const { return DDG.SUnits.end(); }
-
-  /// The following two methods are used to find the cross-boundary edges,
-  /// by starting from a pre-boundary node and select its successor edges that
-  /// connect to a post-boundary node.
-  /// ---
-  /// Retrieve the SUnit that represents MI's instance before the
-  /// boundary, null if not found.
-  const SUnit *getPreBoundaryNode(MachineInstr *MI) const;
-
-  /// Check whether SU represents an instruction after the boundary
-  bool isPostBoundaryNode(SUnit *SU) const;
-};
-
 // BlockType determines scheduling priority, direction and safety margin
 // handling.
 enum class BlockType { Regular, Loop, Epilogue };
@@ -113,7 +66,7 @@ enum class SchedulingStage {
 /// Parameters that drive fixpoint convergence
 class FixedpointState {
 public:
-  SchedulingStage Stage = SchedulingStage::Scheduling;
+  SchedulingStage Stage = SchedulingStage::GatheringRegions;
   // Parameters of the loop-aware convergence
   int LatencyMargin = 0;
   SmallMapVector<MachineInstr *, int, 8> PerMILatencyMargin;
@@ -153,9 +106,7 @@ class Region {
 
 public:
   Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
-         MachineBasicBlock::iterator End,
-         ArrayRef<MachineBundle> TopFixedBundles,
-         ArrayRef<MachineBundle> BotFixedBundles);
+         MachineBasicBlock::iterator End);
 
   using free_iterator = std::vector<MachineInstr *>::const_iterator;
   using fixed_iterator = MachineBasicBlock::iterator;
@@ -181,6 +132,18 @@ class Region {
   }
   ArrayRef<MachineBundle> getBotFixedBundles() const { return BotFixedBundles; }
 
+  /// Set the fixed bundles at the top of the region (e.g. a SWP epilogue).
+  /// The instructions must already be physically present at the start of the
+  /// block. Trims SemanticOrder to exclude the newly fixed instructions.
+  /// \pre The region starts at BB->begin().
+  void setTopFixedBundles(ArrayRef<MachineBundle> Bundles);
+
+  /// Set the fixed bundles at the bottom of the region (e.g. a SWP prologue).
+  /// The instructions must already be physically present at the end of the
+  /// block. Trims SemanticOrder to exclude the newly fixed instructions.
+  /// \pre The region ends at BB->end().
+  void setBotFixedBundles(ArrayRef<MachineBundle> Bundles);
+
   MachineInstr *getExitInstr() const { return ExitInstr; }
 
   std::vector<MachineBundle> Bundles;
@@ -240,15 +203,11 @@ class BlockState {
     TheBundles.insert(TheBundles.end(), Bundles.begin(), Bundles.end());
   }
   void addRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator RegionBegin,
-                 MachineBasicBlock::iterator RegionEnd,
-                 ArrayRef<MachineBundle> TopFixedBundles,
-                 ArrayRef<MachineBundle> BotFixedBundles) {
-    assert((Kind == BlockType::Loop &&
-            FixPoint.Stage == SchedulingStage::GatheringRegions) ||
-           FixPoint.Stage == SchedulingStage::Scheduling);
+                 MachineBasicBlock::iterator RegionEnd) {
+    assert(FixPoint.Stage == SchedulingStage::GatheringRegions ||
+           (FixPoint.Stage == SchedulingStage::Scheduling && Regions.empty()));
     CurrentRegion = Regions.size();
-    Regions.emplace_back(BB, RegionBegin, RegionEnd, TopFixedBundles,
-                         BotFixedBundles);
+    Regions.emplace_back(BB, RegionBegin, RegionEnd);
   }
   auto &getCurrentRegion() const { return Regions.at(CurrentRegion); }
   auto &getCurrentRegion() { return Regions[CurrentRegion]; }
@@ -256,6 +215,10 @@ class BlockState {
   const Region &getTop() const { return Regions.back(); }
   Region &getTop() { return Regions.back(); }
   const Region &getBottom() const { return Regions.front(); }
+  InterBlockEdges &getBoundaryEdges() {
+    assert(Kind == BlockType::Loop && BoundaryEdges);
+    return *BoundaryEdges;
+  }
   const InterBlockEdges &getBoundaryEdges() const {
     assert(Kind == BlockType::Loop && BoundaryEdges);
     return *BoundaryEdges;
@@ -344,7 +307,7 @@ class InterBlockScheduling {
 
   /// Return one instruction that needs a higher latency cap, or nullptr if all
   /// latencies converged.
-  MachineInstr *latencyConverged(BlockState &BS) const;
+  MachineInstr *latencyConverged(BlockState &BS);
 
   /// After finding the loops, determine the epilogue blocks.
   void markEpilogueBlocks();
@@ -437,6 +400,8 @@ class InterBlockScheduling {
 
   AIEAlternateDescriptors &getSelectedAltDescs() { return SelectedAltDescs; }
 
+  const MachineSchedContext *getContext() const { return Context; }
+
   std::optional<SWPEpilogueContext>
   getSWPEpilogueContext(MachineBasicBlock *MBB);
 };
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index 0aef0e14b0cd..e144eecb29c6 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -11,6 +11,7 @@
 #include "AIEMachineScheduler.h"
 #include "AIEBaseAliasAnalysis.h"
 #include "AIEBaseInstrInfo.h"
+#include "AIEBaseSubtarget.h"
 #include "AIEBundle.h"
 #include "AIEHazardRecognizer.h"
 #include "AIEInterBlockScheduling.h"
@@ -783,13 +784,8 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
   CurMBB = MBB;
   // We force bottom up region processing, so the first region
   // from a block is the bottom one. We reset this when leaving any
-  // region
+  // region.
   IsBottomRegion = true;
-
-  // The block may have a timed region, append its instructions.
-  auto &BS = InterBlock.getBlockState(MBB);
-  InterBlock.emitInterBlockTop(BS);
-  InterBlock.emitInterBlockBottom(BS);
 }
 
 static MachineBasicBlock::iterator
@@ -1164,17 +1160,7 @@ int getEarliestLoopCarriedUse(const SUnit &SU,
   assert(SUInCurrentIteration);
   assert(SUInCurrentIteration->getHeight() >= SU.getHeight());
 
-  // Look at loop-carried dependencies to see how early the instruction will be
-  // needed in the next iteration.
-  int EarliestCycle = std::numeric_limits<int>::max();
-  for (const SDep &Succ : SUInCurrentIteration->Succs) {
-    if (!LoopEdges.isPostBoundaryNode(Succ.getSUnit()))
-      continue;
-
-    EarliestCycle = std::min(EarliestCycle, int(Succ.getSUnit()->getHeight()));
-  }
-
-  return EarliestCycle;
+  return LoopEdges.getPreHeight(SUInCurrentIteration);
 }
 
 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
@@ -1737,6 +1723,14 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
   DAG.ExitSU.setInstr(Region.getExitInstr());
   DAG.makeMaps();
   DAG.buildEdges(Context->AA);
+  // Apply all post-RA mutations in the correct order. buildGraph owns the
+  // complete graph construction pipeline (raw edges + mutations); the
+  // postProcessDAG() call in ScheduleDAGMI::schedule() is a no-op because
+  // the registered Mutations list is intentionally empty (getPostRAMutations
+  // returns empty, and createPostMachineScheduler no longer registers them).
+  const Triple &TT = DAG.MF.getTarget().getTargetTriple();
+  for (auto &M : AIEBaseSubtarget::getPostRAMutationsImpl(TT, Context->AA))
+    M->apply(&DAG);
   static_cast<AIEScheduleDAGMI &>(DAG).recordDbgInstrs(Region);
 }
 
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
index a8859864a1bd..1ca79c94e817 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -74,144 +74,89 @@ bool MaxLatencyFinder::isBottomRegion(MachineInstr *ExitMI) {
   return std::next(It) == CurBB->end();
 }
 
-/// Check whether SrcOp and DstOp might refer to the same value
-static bool overlap(const MachineOperand &SrcOp, const MachineOperand &DstOp,
-                    const TargetRegisterInfo *TRI) {
-  Register SrcReg = SrcOp.getReg();
-  Register DstReg = DstOp.getReg();
+MaxLatencyFinder::MaxLatencyFinder(const MachineSchedContext &C,
+                                   const AIEPostRASchedStrategy *Scheduler,
+                                   MachineBasicBlock *CurBB)
+    : Scheduler(Scheduler), TII(static_cast<const AIEBaseInstrInfo *>(
+                                C.MF->getSubtarget().getInstrInfo())),
+      Itineraries(C.MF->getSubtarget().getInstrItineraryData()),
+      TRI(C.MF->getSubtarget().getRegisterInfo()), CurBB(CurBB),
+      InterBlock(true) {}
 
-  // Use TRI's regsOverlap which handles both physical and virtual registers,
-  // including subregisters and lane masks
-  return TRI->regsOverlap(SrcReg, DstReg);
-}
-
-/// Check whether Dst depends on Src
-static bool depends(const MachineInstr &Src, const MachineInstr &Dst,
-                    const TargetRegisterInfo *TRI, AAResults *AA,
-                    bool SafeToIgnoreMemDeps) {
-
-  const AIEBaseInstrInfo *const TII = static_cast<const AIEBaseInstrInfo *>(
-      Src.getMF()->getSubtarget().getInstrInfo());
-  // Detect dependency between lock and ld/st intructions.
-  if ((TII->isLock(Src.getOpcode()) && (Dst.mayLoadOrStore())) ||
-      (TII->isLock(Dst.getOpcode()) && (Src.mayLoadOrStore()))) {
-    return true;
-  }
-
-  // We detect any common register input/output between Dst and Src
-  for (auto &SrcOp : Src.operands()) {
-    if (!SrcOp.isReg()) {
-      continue;
-    }
-    for (auto &DstOp : Dst.operands()) {
-      if (!DstOp.isReg()) {
-        continue;
-      }
-      // Exclude the RAR case
-      if (SrcOp.isUse() && DstOp.isUse()) {
-        continue;
-      }
-      if (overlap(SrcOp, DstOp, TRI)) {
-        return true;
-      }
-    }
+// This is called from different contexts, so we need some case analysis.
+// If we have a basic block, we are in a regular MachineScheduler invocation
+// and can retrieve its strategy; otherwise we are an abstract region and
+// Scheduler is nullptr. The CurBB checks (InterBlock's short-circuit and the
+// guard in the body) keep a null Scheduler from being dereferenced.
+MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG)
+    : Scheduler(DAG->getBB()
+                    ? static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl()
+                    : nullptr),
+      TII(static_cast<const AIEBaseInstrInfo *>(DAG->TII)),
+      Itineraries(DAG->getSchedModel()->getInstrItineraries()),
+      TRI(DAG->MF.getSubtarget().getRegisterInfo()), CurBB(DAG->getBB()),
+      InterBlock(InterBlockLatency && CurBB &&
+                 isBottomRegion(DAG->ExitSU.getInstr()) &&
+                 Scheduler->successorsAreScheduled(CurBB)) {
+  if (CurBB && Scheduler) {
+    const Region &CurRegion =
+        Scheduler->getInterBlock().getBlockState(CurBB).getCurrentRegion();
+    buildInterBlockEdges(CurRegion);
   }
+}
 
-  // Use alias analysis if available.
-  // The memory latency is accounted for by maxLatency() and any
-  // possible dependence will be corrected for by its scheduled cycle.
-  // (RAW || WAW) ||
-  // (WAR)
-  if ((Src.mayStore() && (Dst.mayLoad() || Dst.mayStore())) ||
-      (Src.mayLoad() && Dst.mayStore())) {
-
-    // For non-part-word memory instructions, use alias analysis (if available)
-    // to determine if Src and Dst may alias. Part-word instructions are always
-    // treated conservatively due to their read-modify-write behavior.
-    auto IsPartWordStore = [&TII](const MachineInstr &MaybePartStore) {
-      return MaybePartStore.mayStore() &&
-             TII->isPartWordMemoryInst(MaybePartStore);
-    };
+void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) {
+  const MachineSchedContext &C = *Scheduler->getInterBlock().getContext();
+  const InterBlockScheduling &IB = Scheduler->getInterBlock();
 
-    if (!IsPartWordStore(Src)) {
+  HasUnknownSuccessors = CurBB->succ_empty();
 
-      // If it's safe to ignore memory dependencies, skip memory checks.
-      if (SafeToIgnoreMemDeps)
-        return false;
+  // When the outer loop pipeliner has annotated the epilogue to indicate that
+  // epilogue stores will not alias with loads from the peeled iteration, we
+  // suppress cross-boundary memory edges in the inter-block DDG accordingly.
+  const bool SafeToIgnoreMemDeps =
+      IB.getBlockState(CurBB).isSafeToIgnoreMemDeps();
 
-      if (AA)
-        return Src.mayAlias(AA, Dst, true);
+  for (MachineBasicBlock *SuccBB : CurBB->successors()) {
+    InterBlockEdges &SE = *PerSuccEdges.emplace_back(
+        std::make_unique<InterBlockEdges>(C, SafeToIgnoreMemDeps));
+
+    // Pre-boundary: free instructions of the current region.
+    for (MachineInstr *MI : CurRegion.getFreeInstructions())
+      SE.addNode(MI);
+
+    SE.markBoundary();
+
+    // Post-boundary: always use getFreeInstructions() as the single source of
+    // node identity. Empty regions signify empty basic blocks; in that case no
+    // post-boundary nodes are added.
+    const BlockState &SBS = IB.getBlockState(SuccBB);
+    if (!SBS.getRegions().empty()) {
+      for (MachineInstr *MI : SBS.getTop().getFreeInstructions())
+        SE.addNode(MI);
     }
 
-    // Conservative: assume dependency for part-word instructions or when AA
-    // is unavailable
-    return true;
-  }
+    SE.buildEdges();
 
-  return false;
-}
-
-InstrAndCycle findEarliestRef(const MachineInstr &SrcMI,
-                              ArrayRef<MachineBundle> Bundles, int Prune,
-                              AAResults *AA, bool SafeToIgnoreMemDeps) {
-  const TargetRegisterInfo *TRI =
-      SrcMI.getMF()->getSubtarget().getRegisterInfo();
-  int Cycle = 0;
-  for (const auto &Bundle : Bundles) {
-    if (Cycle >= Prune) {
-      LLVM_DEBUG(dbgs() << " prune at " << Cycle << "\n");
-      return {/*MI=*/nullptr, Cycle};
+    // After the graph is built, record the scheduled cycle depth for each
+    // post-boundary instruction and the total length of the successor block's
+    // top region. Skip only this successor when it has nothing scheduled;
+    // the remaining successors still need their own InterBlockEdges.
+    // Instructions without a recorded depth fall back to depth 0.
+    if (!SBS.isScheduled() || SBS.getRegions().empty()) {
+      continue;
+    }
-    for (MachineInstr *DstMI : Bundle.getInstrs()) {
-      LLVM_DEBUG(dbgs() << " " << *DstMI);
-      if (depends(SrcMI, *DstMI, TRI, AA, SafeToIgnoreMemDeps)) {
-        LLVM_DEBUG(dbgs() << "    depends in cycle=" << Cycle << "\n");
-        return {DstMI, Cycle};
+    int Cycle = 0;
+    for (const MachineBundle &Bundle : SBS.getTop().Bundles) {
+      for (MachineInstr *MI : Bundle.getInstrs()) {
+        SE.recordPostDepth(MI, Cycle);
       }
+      ++Cycle;
     }
-    Cycle++;
+    SE.setPostRegionLength(Cycle);
   }
-  return {/*MI=*/nullptr, Cycle};
 }
 
-MaxLatencyFinder::MaxLatencyFinder(
-    const AIEPostRASchedStrategy *const Scheduler,
-    const AIEBaseInstrInfo *const TII,
-    const InstrItineraryData *const Itineraries,
-    const MCRegisterInfo *const TRI, MachineBasicBlock *const CurBB,
-    AAResults *AA)
-    : Scheduler(Scheduler), TII(TII), Itineraries(Itineraries), TRI(TRI),
-      CurBB(CurBB), InterBlock(true), AA(AA), SafeToIgnoreMemDeps(false) {}
-
-// This is called from different contexts, so we need some case analysis
-// If we have a basic block, we are in a regular MachineScheduler invocation,
-// and we will be able to retrieve its strategy,
-// Otherwise we are an abstract region; Scheduler will be nullptr, which
-// will not be derefenced.
-MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA)
-    : Scheduler(DAG->getBB()
-                    ? static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl()
-                    : nullptr),
-      TII(static_cast<const AIEBaseInstrInfo *>(DAG->TII)),
-      Itineraries(DAG->getSchedModel()->getInstrItineraries()),
-      TRI(DAG->MF.getSubtarget().getRegisterInfo()), CurBB(DAG->getBB()),
-      InterBlock(InterBlockLatency && CurBB &&
-                 isBottomRegion(DAG->ExitSU.getInstr()) &&
-                 Scheduler->successorsAreScheduled(CurBB)),
-      AA(AA),
-      // This is a current assumption needed to achieve a proper compact
-      // schedule.
-      // A loop is considered a candidate for outer loop pipelining if there are
-      // no memory-carried dependencies. The outer loop pipeliner attaches
-      // related metadata to the loop/epilogue, which we capture here. This
-      // metadata indicates that epilogue stores will not alias with loads from
-      // the peeled iteration. We will further analyze why AA is too
-      // conservative in some cases and remove this assumption when possible.
-      SafeToIgnoreMemDeps(Scheduler && CurBB &&
-                          Scheduler->getInterBlock()
-                              .getBlockState(CurBB)
-                              .isSafeToIgnoreMemDeps()) {}
-
 unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
   LLVM_DEBUG(dbgs() << MI << "\n");
   // If we don't use interblock information, include the 'StageLatency'
@@ -239,29 +184,55 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
     }
     return Latency;
   }
-  LLVM_DEBUG(dbgs() << "Earliest for: " << MI);
-  // Track the earliest use in any successor block, given the cycles in
-  // which these uses are scheduled
-  int Earliest = Latency;
-  for (MachineBasicBlock *SuccBB : CurBB->successors()) {
-    auto &SBS = IB.getBlockState(SuccBB);
-    assert(SBS.isScheduled());
-    if (SBS.getRegions().empty()) {
-      // Blocks can be empty. getTop() will fail, and Earliest=0 is
-      // a conservative value
-      Earliest = 0;
+
+  int EffectiveLatency = HasUnknownSuccessors ? Latency : 0;
+  LLVM_DEBUG(dbgs() << "   EffectiveLatency=" << EffectiveLatency
+                    << (HasUnknownSuccessors ? " (HasUnknownSuccessors)"
+                                             : " (known successors)")
+                    << "\n");
+  for (auto &SEPtr : PerSuccEdges) {
+    InterBlockEdges &SE = *SEPtr;
+    const SUnit *Pred = SE.getPreBoundaryNode(&MI);
+    if (!Pred) {
+      LLVM_DEBUG(
+          dbgs() << "   No pre-boundary node for this successor, skip\n");
       continue;
     }
-    const std::vector<AIE::MachineBundle> &TopBundles = SBS.getTop().Bundles;
-    Earliest =
-        findEarliestRef(MI, TopBundles, Earliest, AA, SafeToIgnoreMemDeps)
-            .Cycle;
+    LLVM_DEBUG(dbgs() << "   Pre-boundary SU#" << Pred->NodeNum << " has "
+                      << Pred->Succs.size() << " successor edge(s)\n");
+
+    for (const SDep &Dep : Pred->Succs) {
+      SUnit *Succ = Dep.getSUnit();
+      if (!SE.isPostBoundaryNode(Succ)) {
+        LLVM_DEBUG(dbgs() << "   SU#" << Succ->NodeNum
+                          << " is not a post-boundary node, skip\n");
+        continue;
+      }
+
+      // For ExitSU the depth is the full length of the successor block's
+      // top region (all its cycles have elapsed before reaching ExitSU).
+      // For a regular instruction node the depth is its scheduled cycle
+      // within the block.
+      const int Depth = Succ->isBoundaryNode() ? SE.getPostRegionLength()
+                                               : SE.getPostDepthOr(Succ, 0);
+      const int EdgeLat = Dep.getSignedLatency();
+      const int Remaining = EdgeLat - Depth;
+      LLVM_DEBUG(
+          dbgs() << "   " << (Succ->isBoundaryNode() ? "ExitSU" : "SU#")
+                 << (Succ->isBoundaryNode() ? ""
+                                            : std::to_string(Succ->NodeNum))
+                 << ": latency=" << EdgeLat << ", depth=" << Depth
+                 << ", remaining=" << Remaining
+                 << ", updating EffectiveLatency " << EffectiveLatency << " -> "
+                 << std::max(EffectiveLatency, Remaining) << "\n");
+      EffectiveLatency = std::max(EffectiveLatency, Remaining);
+    }
   }
+  // Cap at the raw maxLatency of the source instruction.
+  EffectiveLatency = std::min(EffectiveLatency, Latency);
+  LLVM_DEBUG(dbgs() << "   EffectiveLatency=" << EffectiveLatency << "\n");
 
-  LLVM_DEBUG(dbgs() << "   Earliest=" << Earliest << "\n");
-  Latency = std::max(Latency - Earliest, 1);
-  LLVM_DEBUG(dbgs() << "EffectiveLatency=" << Latency << "\n");
-  return Latency;
+  return static_cast<unsigned>(EffectiveLatency);
 }
 
 } // namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
index e5e17d40452b..7705cf2ac4c5 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
@@ -16,8 +16,11 @@
 #define LLVM_LIB_TARGET_AIE_MAXLATENCYFINDER_H
 
 #include "AIEBaseSubtarget.h"
+#include "AIEDataDependenceHelper.h"
 #include "AIEMachineScheduler.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include <memory>
+#include <vector>
 
 using namespace llvm;
 
@@ -27,20 +30,6 @@ namespace llvm::AIE {
 int maxLatency(const MachineInstr *MI, const AIEBaseInstrInfo &InstrInfo,
                const InstrItineraryData &Itineraries, bool IncludeStages);
 
-struct InstrAndCycle {
-  MachineInstr *MI = nullptr;
-  int Cycle;
-};
-
-/// Find the first dependence on SrcMI in Bundles[0,Prune)
-/// \returns the Cycle in which the dependence happens or a conservative lower
-///          bound and the instruction responsible for the dependency if it is
-///          found.
-InstrAndCycle findEarliestRef(const MachineInstr &SrcMI,
-                              ArrayRef<MachineBundle> Bundles, int Prune,
-                              AAResults *AA = nullptr,
-                              bool SafeToIgnoreMemDeps = false);
-
 class MaxLatencyFinder {
   const AIEPostRASchedStrategy *const Scheduler;
   const AIEBaseInstrInfo *const TII;
@@ -48,24 +37,34 @@ class MaxLatencyFinder {
   const MCRegisterInfo *const TRI;
   MachineBasicBlock *const CurBB;
   const bool InterBlock;
-  AAResults *AA;
-  bool SafeToIgnoreMemDeps;
 
-  // Check whether this region connects to the successor blocks
-  //
+  /// One entry per CFG successor of CurBB.  InterBlockEdges is heap-allocated
+  /// via unique_ptr because it inherits from ScheduleDAGInstrs which is not
+  /// safely moveable.
+  std::vector<std::unique_ptr<InterBlockEdges>> PerSuccEdges;
+
+  /// True when CurBB has no CFG successors (e.g. a return block), requiring
+  /// the conservative raw latency as a floor.
+  bool HasUnknownSuccessors = false;
+
+  // Check whether this region connects to the successor blocks.
   bool isBottomRegion(MachineInstr *ExitMI);
 
+  // Build one InterBlockEdges per CFG successor of CurBB and populate
+  // PerSuccEdges.
+  void buildInterBlockEdges(const Region &CurRegion);
+
 public:
   // Constructors
-  MaxLatencyFinder(const AIEPostRASchedStrategy *const Scheduler,
-                   const AIEBaseInstrInfo *const TII,
-                   const InstrItineraryData *const Itineraries,
-                   const MCRegisterInfo *const TRI,
-                   MachineBasicBlock *const CurBB, AAResults *AA = nullptr);
+  // Derive TII, TRI, and Itineraries from the scheduling context, keeping
+  // only Scheduler and CurBB as explicit parameters.
+  MaxLatencyFinder(const MachineSchedContext &C,
+                   const AIEPostRASchedStrategy *Scheduler,
+                   MachineBasicBlock *CurBB);
 
-  MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA = nullptr);
+  MaxLatencyFinder(ScheduleDAGInstrs *DAG);
 
-  // Find the maximum latency of MI taking  successors into account
+  // Find the maximum latency of MI taking successors into account.
   unsigned operator()(MachineInstr &MI);
 };
 
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
index 68ee86c98a44..713f0d7e3981 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
@@ -34,8 +34,8 @@ declare { ptr, i20, i20 } @llvm.aie2.add.3d(ptr, i20, i20, i20, i20, i20, i20, i
 define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm2_data, ptr noalias %ofm_data, ptr %.out, ptr %conv.i.i.i.out, ptr %idx.ext9.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %conv.i.i.i.i.i.out, ptr %.out6, ptr %conv.i.i.i46.out, ptr %xtraiter.out, ptr %in_ptr1.051.unr.ce.out, ptr %in_ptr2.0.in50.unr.ce.out, ptr %out_ptr.049.unr.ce.out, ptr %itr_left_cnt0.048.unr.ce.out, ptr %itr_left_cnt1.047.unr.ce.out) #3 {
 ; ASM-LABEL: add2d:
 ; ASM:       // %bb.0: // %newFuncRoot
-; ASM-NEXT:    paddb [p0], #40; lda r2, [p0, #64]; nops ; nopxm ; nopv
-; ASM-NEXT:    lda m2, [p0], #-4; nopx
+; ASM-NEXT:    lda r2, [p0, #64]; paddb [p0], #40; nopxm
+; ASM-NEXT:    lda m2, [p0], #-4
 ; ASM-NEXT:    lda m5, [p0], #8
 ; ASM-NEXT:    lda m4, [p0], #8
 ; ASM-NEXT:    lda m3, [p0], #-24
@@ -60,14 +60,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
 ; ASM-NEXT:    lda p0, [p7], #-4; st m1, [p0, #0]; add r7, r2, #-1; mov r6, #1
 ; ASM-NEXT:    lda p4, [p7], #-4; st m0, [p0, #0]; ne r6, r0, r6
 ; ASM-NEXT:    lda r13, [p7], #-4; st dj0, [p0, #0]; movx r0, #3
-; ASM-NEXT:    st dj4, [p0, #0]; ltu r7, r7, r0
-; ASM-NEXT:    st dn0, [p0, #0]; nez r1, r1
-; ASM-NEXT:    lda r9, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
-; ASM-NEXT:    lda r10, [p7], #-4; st r1, [p6, #0] // Delay Slot 5
-; ASM-NEXT:    lda r11, [p7], #-4; st r5, [p0, #0] // Delay Slot 4
-; ASM-NEXT:    lda p7, [p7, #-4]; paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
-; ASM-NEXT:    lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13; padds [p1], m2 // Delay Slot 2
-; ASM-NEXT:    mova r0, #0; paddb [p2], m3; st r8, [p0, #0] // Delay Slot 1
+; ASM-NEXT:    lda r9, [p7], #-4; st dj4, [p0, #0]; ltu r7, r7, r0
+; ASM-NEXT:    lda r10, [p7], #-4; st dn0, [p0, #0]; nez r1, r1
+; ASM-NEXT:    lda r11, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
+; ASM-NEXT:    lda p7, [p7, #-4]; st r1, [p6, #0] // Delay Slot 5
+; ASM-NEXT:    st r5, [p0, #0] // Delay Slot 4
+; ASM-NEXT:    paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
+; ASM-NEXT:    lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13 // Delay Slot 2
+; ASM-NEXT:    padda [p1], m2; paddb [p2], m3; movx r0, #0; st r8, [p0, #0] // Delay Slot 1
 ; ASM-NEXT:  // %bb.1:
 ; ASM-NEXT:    j #.LBB0_5
 ; ASM-NEXT:    nop // Delay Slot 5
@@ -76,20 +76,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
 ; ASM-NEXT:    nop // Delay Slot 2
 ; ASM-NEXT:    mova r1, #0 // Delay Slot 1
 ; ASM-NEXT:  .LBB0_2: // %entry.new
-; ASM-NEXT:    nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv
+; ASM-NEXT:    vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc0, #0
 ; ASM-NEXT:    vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc4, dc0
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; nopx
-; ASM-NEXT:    vlda.ups.s32.d8 cm0, s1, [p1], m1
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4
-; ASM-NEXT:    vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r3
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm5, s1, [p2], d0
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm3, s1, [p2], d0
-; ASM-NEXT:    nop
-; ASM-NEXT:    movxm ls, #.LBB0_3
-; ASM-NEXT:    mova r0, #-4; movxm le, #.L_LEnd0
-; ASM-NEXT:    and r0, r2, r0
-; ASM-NEXT:    mova r2, #-2; add r0, r0, #-4
-; ASM-NEXT:    lshl r0, r0, r2; mov crSRSSign, r6
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; movx r0, #-4; mov crUPSSign, r4
+; ASM-NEXT:    vlda.ups.s32.d8 cm0, s1, [p1], m1; movxm ls, #.LBB0_3
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; movxm le, #.L_LEnd0
+; ASM-NEXT:    vlda.ups.s32.d8 cm4, s1, [p1], m1; and r0, r2, r0; mov s1, r3
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; add r0, r0, #-4; mov r2, #-2
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm3, s1, [p2], d0; lshl r0, r0, r2; mov crSRSSign, r6
 ; ASM-NEXT:    add r0, r0, #1; mov s0, r5
 ; ASM-NEXT:    nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-1; nopv
 ; ASM-NEXT:  .LBB0_3: // %for.body
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
index a70e2cd07a66..c0977cea0eeb 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
@@ -216,7 +216,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
 ; ZOL-NEXT:  .LBB0_1: // %outer.loop.header
 ; ZOL-NEXT:    // =>This Loop Header: Depth=1
 ; ZOL-NEXT:    // Child Loop BB0_2 Depth 2
-; ZOL-NEXT:    vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopx
+; ZOL-NEXT:    vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopb ; nopx
 ; ZOL-NEXT:    vlda.ups.s32.s16 bml0, s0, [p2], m5
 ; ZOL-NEXT:    vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m7, p5
 ; ZOL-NEXT:    vlda.ups.s32.s16 bml1, s0, [p2], m7
@@ -229,17 +229,17 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
 ; ZOL-NEXT:    vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb wh3, [p0], m6
 ; ZOL-NEXT:    vlda.ups.s32.s16 bml7, s0, [p2], m7; vldb wl7, [p0], m6
 ; ZOL-NEXT:    vlda.ups.s32.s16 bmh5, s0, [p2, #32]; vldb.3d wh7, [p0], d0
-; ZOL-NEXT:    vlda.ups.s32.s16 bml5, s0, [p2], m5; movxm ls, #.LBB0_2
-; ZOL-NEXT:    vldb wl6, [p1], #32; movxm le, #.L_LEnd0
-; ZOL-NEXT:    vlda wh6, [p1], #32; vldb wl5, [p0], m6; mov r1, p0
-; ZOL-NEXT:    vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wh5, [p0], m6; and r0, r0, r9
-; ZOL-NEXT:    vlda wl8, [p1], #32; vldb wl3, [p0], m6; add r0, r0, #33
-; ZOL-NEXT:    vlda wh8, [p1], #32; vldb.3d wh3, [p0], d0; vshift.align x4, x4, s1, x3, r0
-; ZOL-NEXT:    vlda.ups.s32.s16 bml6, s0, [p2, #0]; vldb wl1, [p1], #32; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0
-; ZOL-NEXT:    vldb wh1, [p1], #32; add r0, r1, #33; mov r1, p0
-; ZOL-NEXT:    vldb wl10, [p1], #32; vshuffle x7, x4, x2, r2
-; ZOL-NEXT:    vldb wh10, [p1], #32; vshuffle x9, x7, x0, r8
-; ZOL-NEXT:    nopb ; nopa ; nops ; and r1, r1, r9; add.nc lc, r5, #-2; nopv
+; ZOL-NEXT:    vlda.ups.s32.s16 bml5, s0, [p2], m5
+; ZOL-NEXT:    vldb wl6, [p1], #32; movxm ls, #.LBB0_2
+; ZOL-NEXT:    vldb wh6, [p1], #32; movxm le, #.L_LEnd0
+; ZOL-NEXT:    vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wl5, [p0], m6; and r0, r0, r9; mov r1, p0
+; ZOL-NEXT:    vlda wl8, [p1], #32; vldb wh5, [p0], m6; add r0, r0, #33
+; ZOL-NEXT:    vlda wh8, [p1], #32; vldb wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0
+; ZOL-NEXT:    vlda.ups.s32.s16 bml6, s0, [p2, #0]; vldb.3d wh3, [p0], d0; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0
+; ZOL-NEXT:    vldb wl1, [p1], #32; add r0, r1, #33; mov r1, p0
+; ZOL-NEXT:    vldb wh1, [p1], #32; vshuffle x7, x4, x2, r2
+; ZOL-NEXT:    vldb wl10, [p1], #32; vshuffle x9, x7, x0, r8
+; ZOL-NEXT:    vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; add.nc lc, r5, #-2; nopv
 ; ZOL-NEXT:  .LBB0_2: // %inner.loop
 ; ZOL-NEXT:    // Parent Loop BB0_1 Depth=1
 ; ZOL-NEXT:    // => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
index 401aa4affd85..c4dae8bfdf2c 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 
 ; RUN: llc -O2 -mtriple=aie2 \
 ; RUN:    %s -o - | FileCheck %s
@@ -65,6 +65,7 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
 ; CHECK-NEXT:    nopb ; nopa ; nops ; movxm r3, #16512; nopv
 ; CHECK-NEXT:    nopa ; movxm r4, #-16256
 ; CHECK-NEXT:    movxm r5, #32767
+; CHECK-NEXT:    movxm r6, #15616
 ; CHECK-NEXT:    movxm r0, #16256
 ; CHECK-NEXT:    movxm r1, #16384
 ; CHECK-NEXT:    lda r0, [p2, #0]; movxm r2, #16128
@@ -73,37 +74,36 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
 ; CHECK-NEXT:    vbcst.16 x2, r2
 ; CHECK-NEXT:    mova r1, #0; vconv.fp32.bf16 bmh0, wl2
 ; CHECK-NEXT:    vbcst.16 x2, r1
-; CHECK-NEXT:    vldb wl3, [p0], #32; vmov wh0, wl2
-; CHECK-NEXT:    mova r1, #-5; vmov wh3, wl2
+; CHECK-NEXT:    vmov wh0, wl2
+; CHECK-NEXT:    mova r1, #-5; vldb wl3, [p0], #32; vmov wh3, wl2
 ; CHECK-NEXT:    mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
-; CHECK-NEXT:    movxm r6, #15616; vmul.f bmh2, x0, x3, r1
-; CHECK-NEXT:    movxm r7, #16000
+; CHECK-NEXT:    movxm r7, #16000; vmul.f bmh2, x0, x3, r1
 ; CHECK-NEXT:    vbcst.16 x1, r3
 ; CHECK-NEXT:    vbcst.16 x8, r4
-; CHECK-NEXT:    vbcst.16 x10, r5; vmul.f bmh3, x0, x3, r1
+; CHECK-NEXT:    vbcst.16 x10, r5
 ; CHECK-NEXT:    vbcst.16 x6, r6
-; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
-; CHECK-NEXT:    vmov wh6, wl2
+; CHECK-NEXT:    vbcst.16 x4, r7; vmul.f bmh3, x0, x3, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh2; vmov wh6, wl2
+; CHECK-NEXT:    vmov wh4, wl2
 ; CHECK-NEXT:    vmin_ge.bf16 x3, r16, x3, x1
 ; CHECK-NEXT:    or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x8
-; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh3; vband x7, x10, x3
-; CHECK-NEXT:    vmov wh7, wl2
+; CHECK-NEXT:    vband x7, x10, x3
+; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh3; vmov wh3, wl2
+; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2
 ; CHECK-NEXT:    vmin_ge.bf16 x5, r16, x5, x1
 ; CHECK-NEXT:    vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x8
-; CHECK-NEXT:    vband x7, x10, x5
-; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
-; CHECK-NEXT:    vmov wh4, wl2
-; CHECK-NEXT:    vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1
+; CHECK-NEXT:    vband x7, x10, x5; vmul.f bmh2, x6, x7, r1
+; CHECK-NEXT:    vmov wh7, wl2
+; CHECK-NEXT:    vmac.f bmh3, bmh0, x3, x4, r1
+; CHECK-NEXT:    vmul.f bmh4, x6, x7, r1
 ; CHECK-NEXT:    vmul.f bmh5, x0, x7, r1
-; CHECK-NEXT:    vmac.f bmh6, bmh0, x5, x4, r1
+; CHECK-NEXT:    vmov wh5, wl2
 ; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh2; vmul.f bmh7, x0, x7, r1
-; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4
+; CHECK-NEXT:    vmac.f bmh6, bmh0, x5, x4, r1
 ; CHECK-NEXT:    vmsc.f bmh3, bmh3, x7, x3, r1
-; CHECK-NEXT:    movxm ls, #.LBB0_1; vmsc.f bml4, bmh6, x3, x5, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4; movxm ls, #.LBB0_1
 ; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh5; movxm le, #.L_LEnd0
-; CHECK-NEXT:    add.nc lc, r2, #-2
+; CHECK-NEXT:    add.nc lc, r2, #-2; vmsc.f bml4, bmh6, x3, x5, r1
 ; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh7; vmin_ge.bf16 x3, r16, x3, x1
 ; CHECK-NEXT:    vmax_lt.bf16 x3, r16, x3, x8
 ; CHECK-NEXT:    mova r0, #28; vconv.bf16.fp32 wl7, bmh3; vmin_ge.bf16 x11, r16, x5, x1
diff --git a/llvm/test/CodeGen/AIE/aie2/extract.ll b/llvm/test/CodeGen/AIE/aie2/extract.ll
index 28e9ce6a1524..220b8e83f119 100644
--- a/llvm/test/CodeGen/AIE/aie2/extract.ll
+++ b/llvm/test/CodeGen/AIE/aie2/extract.ll
@@ -99,9 +99,9 @@ define dso_local noundef <32 x i8> @_Z30test_extract_v64uint4_256_1024Dv128_DU8_
 ; CHECK-NEXT:    jz r0, #.LBB2_6
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    vlda wl4, [sp, #-160] // 32-byte Folded Reload Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    vlda wh5, [sp, #-64] // 32-byte Folded Reload Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    vlda wl4, [sp, #-160] // 32-byte Folded Reload Delay Slot 1
 ; CHECK-NEXT:  // %bb.3: // %if.else.i
 ; CHECK-NEXT:    j #.LBB2_6
 ; CHECK-NEXT:    nop // Delay Slot 5
@@ -444,9 +444,9 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v8acc32Dv32_u7__acc32i(<16
 ; CHECK-NEXT:    jz r0, #.LBB13_6
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    vlda amhh0, [sp, #-64] // 32-byte Folded Reload Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 1
 ; CHECK-NEXT:  // %bb.3: // %if.else.i
 ; CHECK-NEXT:    j #.LBB13_6
 ; CHECK-NEXT:    nop // Delay Slot 5
@@ -664,9 +664,9 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv16_u7__acc64i(<16
 ; CHECK-NEXT:    jz r0, #.LBB20_6
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    vlda amhh0, [sp, #-64] // 32-byte Folded Reload Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 1
 ; CHECK-NEXT:  // %bb.3: // %if.else.i
 ; CHECK-NEXT:    j #.LBB20_6
 ; CHECK-NEXT:    nop // Delay Slot 5
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
index 2c48a9426854..19af57435e7d 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 
 # RUN: llc --mtriple=aie2 --run-pass=postmisched \
 # RUN:     %s -o - | FileCheck %s
@@ -35,9 +35,6 @@ body:             |
   ; CHECK-NEXT:   NOP
   ; CHECK-NEXT:   NOP
   ; CHECK-NEXT:   $s1 = MOV_mv_scl killed $r2
-  ; CHECK-NEXT:   NOP
-  ; CHECK-NEXT:   NOP
-  ; CHECK-NEXT:   NOP
   ; CHECK-NEXT:   BUNDLE implicit-def $r1, implicit-def dead $srcarry, implicit-def $s0, implicit killed $r1, implicit killed $r4 {
   ; CHECK-NEXT:     renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry
   ; CHECK-NEXT:     $s0 = MOV_mv_scl killed $r4
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
index c2b8cf49d88d..b3384ae6f241 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates
 ; RUN: llc --mtriple=aie2 -O2 --aie-pipeliner-max-guards=2 -enable-aie-zol-without-minitercount=false %s -o - | FileCheck %s
 
 ; Similar to stage0.ll, but now with a do-while. Again we expect a three
@@ -18,14 +18,13 @@
 define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 noundef %n) {
 ; CHECK-LABEL: dot:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    nopa ; movxm m0, #2044
-; CHECK-NEXT:    lda r3, [p1], m0; add r5, r1, #-1
-; CHECK-NEXT:    lda r2, [p0], m0; jz r5, #.LBB0_5
-; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    nopa ; nopb ; add r5, r1, #-1; nopm
+; CHECK-NEXT:    jz r5, #.LBB0_5
+; CHECK-NEXT:    movxm m0, #2044 // Delay Slot 5
+; CHECK-NEXT:    lda r2, [p0], m0 // Delay Slot 4
+; CHECK-NEXT:    lda r3, [p1], m0 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
-; CHECK-NEXT:    movx r0, #0 // Delay Slot 1
+; CHECK-NEXT:    mova r0, #0 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %do.body
 ; CHECK-NEXT:    lda r4, [p1], m0; add r5, r5, #-1
 ; CHECK-NEXT:    lda r1, [p0], m0; jz r5, #.LBB0_4
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll
index 2eb14f4b2d98..fa1c257540d1 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll
@@ -55,7 +55,7 @@ define dso_local i32 @dot(ptr addrspace(6) nocapture readonly %a, ptr addrspace(
 ; PRE-NEXT:   - NS:              '3'
 ; PRE-NEXT:   - Loop:            bb.2.for.body
 ; PRE-NEXT:   - Prologue:        bb.1.for.body.preheader
-; PRE-NEXT:   - PrologueBundles: '10'
+; PRE-NEXT:   - PrologueBundles: '7'
 ; PRE-NEXT:   - Epilogue:        bb.3
 ; PRE-NEXT:   - EpilogueBundles: '6'
 ; PRE-NEXT: ...
diff --git a/llvm/test/CodeGen/AIE/aie2/set.ll b/llvm/test/CodeGen/AIE/aie2/set.ll
index 66c81001a1ad..294939198b68 100644
--- a/llvm/test/CodeGen/AIE/aie2/set.ll
+++ b/llvm/test/CodeGen/AIE/aie2/set.ll
@@ -16,8 +16,8 @@ define dso_local noundef <64 x i8> @_Z29test_set_v128uint_set_512_256iDv32_DU8_(
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov wl0, wl2 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov wl0, wl2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i
 ; CHECK-NEXT:    vmov wh0, wl0
 ; CHECK-NEXT:  .LBB0_2: // %_ZL13set_v128uint4iDv32_DU8_.exit
@@ -199,8 +199,8 @@ define dso_local noundef <128 x i8> @_Z27test_set_v256uint4_1024_512iDv64_DU8_(i
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov x4, x0 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov x4, x0 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i
 ; CHECK-NEXT:    vmov x5, x4
 ; CHECK-NEXT:  .LBB4_2: // %_ZL13set_v256uint4iDv64_DU8_.exit
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll
index 0dd29ac72714..ccf6cb3a1140 100644
--- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll
@@ -27,9 +27,9 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali
 ; CHECK-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; movs dc1, dj0; mov dn1, dn0
 ; CHECK-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lda m0, [p2, #4]; vldb.fill.512 [p0, lf0, r24]
+; CHECK-NEXT:    vldb.fill.512 [p0, lf0, r24]
 ; CHECK-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]
-; CHECK-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
+; CHECK-NEXT:    lda m0, [p2, #4]; vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
 ; CHECK-NEXT:    movxm le, #.L_LEnd0
 ; CHECK-NEXT:    nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; add.nc lc, r0, #-3; nopv
 ; CHECK-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; nopb ; nops ; nopx ; vconv.fp32.bf16 cml1, x6; nopv
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
index b72536458b9e..e52c94ef8500 100644
--- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
@@ -17,54 +17,53 @@
 define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 dereferenceable(64) %params) {
 ; CHECK-LABEL: gelu_fn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; nopx
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; nopx
 ; CHECK-NEXT:    movxm r0, #16544
 ; CHECK-NEXT:    vbcst.16 x6, r0
 ; CHECK-NEXT:    lda r1, [p2, #0]; movxm r0, #17280
 ; CHECK-NEXT:    mova r0, #60; vbcst.16 x2, r0
 ; CHECK-NEXT:    vadd.f dm3, dm1, dm0, r0
-; CHECK-NEXT:    vconv.fp32.bf16 cml0, x6
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.fp32.bf16 cml0, x6
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64
 ; CHECK-NEXT:    movxm r2, #15821
-; CHECK-NEXT:    mova r2, #255; movx r4, #1; vbcst.16 x4, r2
+; CHECK-NEXT:    mova r2, #255; movx r4, #1; vbcst.16 x4, r2; vadd.f dm3, dm2, dm0, r0
 ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x8, cml3; lshl r2, r1, r4; vbcst.16 x0, r2
-; CHECK-NEXT:    mova r2, #828; mov m0, r2; vadd.f dm3, dm2, dm0, r0
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm2, x8, x2, r2
+; CHECK-NEXT:    mova r2, #828; mov m0, r2
+; CHECK-NEXT:    vmul.f dm2, x8, x2, r2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vadd.f dm3, dm1, dm0, r0
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vadd.f dm3, dm2, dm0, r0
-; CHECK-NEXT:    vconv.bf16.fp32 x10, cml3
+; CHECK-NEXT:    vconv.bf16.fp32 x10, cml3; vadd.f dm3, dm1, dm0, r0
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm1, x10, x2, r2
 ; CHECK-NEXT:    vconv.bf16.fp32 x8, cml2
-; CHECK-NEXT:    vmul.f dm1, x10, x2, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x1, cml3
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; vmul.f dm4, x8, x4, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x7, cml3; vmul.f dm2, x1, x2, r2
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vmul.f dm3, x7, x2, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x10, cml1; vadd.f dm1, dm1, dm0, r0
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.bf16.fp32 x8, cml4; movx r3, #0; vmul.f dm4, x10, x4, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x5, cml2; mov s0, r3
-; CHECK-NEXT:    vfloor.s32.bf16 x1, wl8, s0
-; CHECK-NEXT:    vconv.bf16.fp32 x5, cml3; vmul.f dm4, x5, x4, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x7, cml1; movxm ls, #.LBB0_1; vadd.f dm2, dm2, dm0, r0
-; CHECK-NEXT:    mova r4, #-5; nopb ; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0; vmul.f dm3, x5, x4, r2
-; CHECK-NEXT:    mova r1, #2; vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vmul.f dm4, x7, x2, r2
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; vshuffle x1, x1, x3, r1
-; CHECK-NEXT:    vfloor.s32.bf16 x9, wl10, s0; vmin_ge.16 x3, r16, x1, x0, vaddsign1
-; CHECK-NEXT:    vfloor.s32.bf16 x3, wh10, s0; vbcst.16 x6, r3
-; CHECK-NEXT:    vconv.bf16.fp32 x8, cml4; vmax_lt.16 x11, r16, x3, x6, vaddsign1
-; CHECK-NEXT:    padda [p1], m0; nopb ; nops ; nopx ; add.nc lc, r4, #-7; nopv
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmul.f dm4, x8, x4, r2
+; CHECK-NEXT:    vconv.bf16.fp32 x1, cml3; vadd.f dm3, dm2, dm0, r0
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64
+; CHECK-NEXT:    vconv.bf16.fp32 x10, cml1; vmul.f dm2, x1, x2, r2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmul.f dm4, x10, x4, r2
+; CHECK-NEXT:    mova r3, #0; vconv.bf16.fp32 x8, cml4; vadd.f dm1, dm1, dm0, r0
+; CHECK-NEXT:    vconv.bf16.fp32 x7, cml3; mov s0, r3
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vfloor.s32.bf16 x1, wl8, s0
+; CHECK-NEXT:    vconv.bf16.fp32 x5, cml2; vmul.f dm3, x7, x2, r2
+; CHECK-NEXT:    vfloor.s32.bf16 x3, wh8, s0; movxm ls, #.LBB0_1
+; CHECK-NEXT:    mova r4, #-5; nopb ; vconv.bf16.fp32 x10, cml4; movxm le, #.L_LEnd0; vmul.f dm4, x5, x4, r2
+; CHECK-NEXT:    vconv.bf16.fp32 x7, cml1; lshl r4, r1, r4; vadd.f dm2, dm2, dm0, r0
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; movx r1, #2; vbcst.16 x6, r3
+; CHECK-NEXT:    vfloor.s32.bf16 x9, wl10, s0; vshuffle x1, x1, x3, r1; vmul.f dm4, x7, x2, r2
+; CHECK-NEXT:    vconv.bf16.fp32 x5, cml3; vmin_ge.16 x3, r16, x1, x0, vaddsign1
+; CHECK-NEXT:    nopa ; nopb ; vfloor.s32.bf16 x3, wh10, s0; nopx ; add.nc lc, r4, #-7; nopv
+; CHECK-NEXT:    padda [p1], m0; nopb ; vconv.bf16.fp32 x8, cml4; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; vmul.f dm3, x5, x4, r2
 ; CHECK-NEXT:  .LBB0_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    nopa ; nopb ; vconv.bf16.fp32 x10, cml2; nopxm ; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; vadd.f dm2, dm4, dm0, r0
 ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x7, cml4; nopx ; vmov cml4, cml1; vmul.f dm4, x10, x2, r2
 ; CHECK-NEXT:    nopa ; nopb ; vst x11, [p1], #64; nopx ; vshuffle x1, x9, x3, r1; nopv
-; CHECK-NEXT:    nopa ; nopb ; vfloor.s32.bf16 x3, wh8, s0; nopx ; vmin_ge.16 x5, r16, x1, x0, vaddsign1; nopv
-; CHECK-NEXT:    nopa ; nopb ; vfloor.s32.bf16 x9, wl8, s0; nopx ; vmax_lt.16 x11, r16, x5, x6, vaddsign1; nopv
+; CHECK-NEXT:    vfloor.s32.bf16 x3, wh8, s0; vmin_ge.16 x5, r16, x1, x0, vaddsign1
+; CHECK-NEXT:    vfloor.s32.bf16 x9, wl8, s0; vmax_lt.16 x11, r16, x5, x6, vaddsign1
 ; CHECK-NEXT:  .L_LEnd0:
 ; CHECK-NEXT:    nopa ; nopb ; vconv.bf16.fp32 x8, cml3; nopxm ; vmul.f dm3, x7, x4, r2
 ; CHECK-NEXT:  // %bb.2:
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll
index b7354dcb9369..6febf7745d81 100644
--- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll
@@ -68,9 +68,9 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali
 ; ASM-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; movs dc1, dj0; mov dn1, dn0
 ; ASM-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]
 ; ASM-NEXT:    nop
-; ASM-NEXT:    lda m0, [p2, #4]; vldb.fill.512 [p0, lf0, r24]
+; ASM-NEXT:    vldb.fill.512 [p0, lf0, r24]
 ; ASM-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]
-; ASM-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
+; ASM-NEXT:    lda m0, [p2, #4]; vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
 ; ASM-NEXT:    movxm le, #.L_LEnd0
 ; ASM-NEXT:    nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; add.nc lc, r0, #-3; nopv
 ; ASM-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; nopb ; nops ; nopx ; vconv.fp32.bf16 cml1, x6; nopv
diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll
index d890556cbc69..a6f59f17636e 100644
--- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll
@@ -44,18 +44,15 @@ define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0
 ; FINE-GRAINED-NEXT:  .LBB0_1: // %for.body.i
 ; FINE-GRAINED-NEXT:    // =>This Loop Header: Depth=1
 ; FINE-GRAINED-NEXT:    // Child Loop BB0_2 Depth 2
-; FINE-GRAINED-NEXT:    nopa ; nopb ; nopx ; mov dn2, r3; movs dj2, p6
-; FINE-GRAINED-NEXT:    movs dn6, r3; mov r17, dc6
-; FINE-GRAINED-NEXT:    movs dj6, p6; mov m2, m4
-; FINE-GRAINED-NEXT:    mova p1, #0; movs dc6, r4; mov r25, r18
-; FINE-GRAINED-NEXT:    vldb.pop.576.3d ex0, [p1, lf1, r25, d2]
-; FINE-GRAINED-NEXT:    nop
+; FINE-GRAINED-NEXT:    nopa ; nopb ; movs dj2, p6; nopx ; mov dn2, r3; nopv
+; FINE-GRAINED-NEXT:    nopa ; movs dn6, r3; nopx ; mov r17, dc6
+; FINE-GRAINED-NEXT:    movs dj6, p6; or r6, r5, r5; mov r5, dj4
+; FINE-GRAINED-NEXT:    movs m2, m4; vmov lfl1, lfl0
 ; FINE-GRAINED-NEXT:    movs m1, m5; mov dn1, r3
-; FINE-GRAINED-NEXT:    movs dc1, dc0; vmov lfl1, lfl0
-; FINE-GRAINED-NEXT:    movs dj1, m5; vmov lfh1, lfh0
+; FINE-GRAINED-NEXT:    movs dc1, dc0; mov dj1, m5
 ; FINE-GRAINED-NEXT:    mova p0, #0; movs dn5, r3; mov dj5, m5
-; FINE-GRAINED-NEXT:    paddb.3d [p0], d1; or r6, r5, r5; mov r5, dj4
-; FINE-GRAINED-NEXT:    mova p0, #0; mov r21, dc5
+; FINE-GRAINED-NEXT:    mova p1, #0; paddb.3d [p0], d1; or r25, r18, r18; vmov lfh1, lfh0; movs dc6, r4
+; FINE-GRAINED-NEXT:    mova p0, #0; vldb.pop.576.3d ex0, [p1, lf1, r25, d2]; mov r21, dc5
 ; FINE-GRAINED-NEXT:  .LBB0_2: // %for.body125.i
 ; FINE-GRAINED-NEXT:    // Parent Loop BB0_1 Depth=1
 ; FINE-GRAINED-NEXT:    // => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll b/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll
index a3ea23ad71f1..cb2c9076b76d 100644
--- a/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aie2p | FileCheck %s
 
 %struct.v64bfp16ebs16 = type <{ <64 x i8>, <8 x i8> }>
@@ -197,8 +197,8 @@ define dso_local noundef <32 x i8> @_Z20test_extract_v32int813v64bfp16ebs16i(%st
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov x0, x2 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov x0, x2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i.i
 ; CHECK-NEXT:    vmov wl0, wh0
 ; CHECK-NEXT:  .LBB10_2: // %_ZL15extract_v32int813v64bfp16ebs16i.exit
@@ -238,8 +238,8 @@ define dso_local noundef <32 x i8> @_Z20test_extract_v32int812v64bfp16ebs8i(%str
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov x0, x2 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov x0, x2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i.i
 ; CHECK-NEXT:    vmov wl0, wh0
 ; CHECK-NEXT:  .LBB11_2: // %_ZL15extract_v32int812v64bfp16ebs8i.exit
@@ -340,8 +340,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z11test_insert13v128bfp16ebs8i12v64bfp1
 ; CHECK-NEXT:    nopa ; jz r0, #.LBB15_2
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    mov r4, el2 // Delay Slot 3
-; CHECK-NEXT:    vmov x1, x2 // Delay Slot 2
+; CHECK-NEXT:    vmov x1, x2 // Delay Slot 3
+; CHECK-NEXT:    mov r4, el2 // Delay Slot 2
 ; CHECK-NEXT:    mov r5, eh2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.end.i
 ; CHECK-NEXT:    j #.LBB15_3
@@ -622,8 +622,8 @@ define dso_local %struct.v128bfp16ebs16 @_Z11test_insert14v128bfp16ebs16i13v64bf
 ; CHECK-NEXT:    nopa ; jz r0, #.LBB22_2
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    mov r4, el2 // Delay Slot 3
-; CHECK-NEXT:    vmov x1, x2 // Delay Slot 2
+; CHECK-NEXT:    vmov x1, x2 // Delay Slot 3
+; CHECK-NEXT:    mov r4, el2 // Delay Slot 2
 ; CHECK-NEXT:    mov r5, eh2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.end.i
 ; CHECK-NEXT:    j #.LBB22_3
diff --git a/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll b/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll
index 10b3d045ad97..c42209602468 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll
@@ -32,16 +32,16 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:    eq r18, r20, r16
 ; CHECK-NEXT:    paddxm [sp], #64; lshl r28, r24, r16
 ; CHECK-NEXT:    st p6, [sp, #-64]; ltu r27, r16, r6 // 4-byte Folded Spill
-; CHECK-NEXT:    mova dj1, #96; st r26, [p3, dj0]; or r18, r28, r18
-; CHECK-NEXT:    st.s8 r18, [p3, dj1]; add r2, r18, r2
-; CHECK-NEXT:    sel.nez r18, r2, r0, r27
+; CHECK-NEXT:    st p7, [sp, #-60]; or r18, r28, r18 // 4-byte Folded Spill
+; CHECK-NEXT:    mova dj1, #96; st r26, [p3, dj0]; add r2, r18, r2
+; CHECK-NEXT:    st.s8 r18, [p3, dj1]; sel.nez r18, r2, r0, r27
 ; CHECK-NEXT:    ne r26, r2, r16
 ; CHECK-NEXT:    jnz r26, #.LBB0_2
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    mova dj0, #76; ltu r28, r16, r18 // Delay Slot 3
-; CHECK-NEXT:    st r28, [p3, dj0]; mov r7, r8 // Delay Slot 2
-; CHECK-NEXT:    mova r2, #5; st p7, [sp, #-60]; or r17, r10, r10; mov r19, r12 // 4-byte Folded Spill Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    mova dj0, #76; ltu r28, r16, r18; mov r7, r8 // Delay Slot 2
+; CHECK-NEXT:    mova r2, #5; st r28, [p3, dj0]; or r17, r10, r10; mov r19, r12 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    jnz r24, #.LBB0_3
 ; CHECK-NEXT:    nop // Delay Slot 5
@@ -73,11 +73,9 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:    or r26, r24, r18
 ; CHECK-NEXT:  .LBB0_3: // %if.end.i
 ; CHECK-NEXT:    mova m0, #80; nopb ; nops ; nopx ; mov p2, p3; nopv
-; CHECK-NEXT:    padda [p2], m0
-; CHECK-NEXT:    st r28, [p2], #24
-; CHECK-NEXT:    st.s8 r26, [p2, #0]; ne r6, r20, r6
-; CHECK-NEXT:    jnz r6, #.LBB0_5
-; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    padda [p2], m0; ne r6, r20, r6; nopm
+; CHECK-NEXT:    st r28, [p2], #24; jnz r6, #.LBB0_5
+; CHECK-NEXT:    st.s8 r26, [p2, #0] // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
@@ -86,17 +84,15 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:    movxm r6, #16777215
 ; CHECK-NEXT:    mova dj0, #92; and r4, r4, r6
 ; CHECK-NEXT:    st r4, [p3, dj0]
-; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .LBB0_5: // %_Z24setup_conv2d_iter_paramsR13conv2d_params.exit
-; CHECK-NEXT:    mova dj0, #84; nopb ; nopxm
-; CHECK-NEXT:    lda r20, [p3, dj0]; extend.u8 r4, r26
-; CHECK-NEXT:    mova dj0, #120; eq r6, r4, r22
-; CHECK-NEXT:    lda r24, [p3, dj0]; jnz r6, #.LBB0_7
+; CHECK-NEXT:    nopa ; nopb ; nops ; extend.u8 r4, r26; nopm ; nopv
+; CHECK-NEXT:    eq r6, r4, r22
+; CHECK-NEXT:    jnz r6, #.LBB0_7
 ; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    nop // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    mova dj0, #84 // Delay Slot 4
+; CHECK-NEXT:    lda r20, [p3, dj0] // Delay Slot 3
+; CHECK-NEXT:    mova dj0, #120 // Delay Slot 2
+; CHECK-NEXT:    lda r24, [p3, dj0] // Delay Slot 1
 ; CHECK-NEXT:  // %bb.6: // %_Z24setup_conv2d_iter_paramsR13conv2d_params.exit
 ; CHECK-NEXT:    ne r4, r4, r16
 ; CHECK-NEXT:    jnz r4, #.LBB0_11
@@ -142,22 +138,21 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:  .LBB0_8: // %for.body.i68
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
-; CHECK-NEXT:    nopa ; vldb x1, [p1, #64]; nopx ; mov r0, dc6; nops
+; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; mov r0, dc6; nopv
 ; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p4, #0]; vldb.popx x4, [p0, lf0, r24]; lshl r0, r0, r2; mov dc4, dc3
 ; CHECK-NEXT:    vlda.pop.3d x6, [p0, lf0, r24, d0]; or r20, r0, r16; mov dj3, r0
 ; CHECK-NEXT:    vldb.128 wl2, [p2, dj3]; mov dj3, r20
 ; CHECK-NEXT:    vldb.128 wl8, [p2, dj3]
 ; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p6, #0]; vldb x10, [p1, #0]
-; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p4, #64]
-; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p6, #64]; vldb.popx x10, [p0, lf0, r24]
-; CHECK-NEXT:    vldb.pop.3d x8, [p0, lf0, r24, d0]
-; CHECK-NEXT:    vldb.popx x10, [p0, lf0, r24]; mov p7, p1
-; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; vshuffle x2, x4, x6, r6; vmul dm2, x0, x2, r10
-; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; add.nc lc, r18, #-6; padds [p7], #128; vmul dm3, x0, x8, r10
-; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; movxm ls, #.LBB0_9; vaddmac dm1, dm1, dm2, x2, x10, r12
-; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x2, x1, r12
+; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p4, #64]; vldb.popx x10, [p0, lf0, r24]
+; CHECK-NEXT:    vlda x1, [p1, #64]; vldb.pop.3d x8, [p0, lf0, r24, d0]
+; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p6, #64]; vldb.popx x10, [p0, lf0, r24]; mov p7, p1
+; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; add.nc lc, r18, #-6; vshuffle x2, x4, x6, r6; vmul dm2, x0, x2, r10
+; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; movxm ls, #.LBB0_9; vmul dm3, x0, x8, r10
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x2, x10, r12
 ; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopxm ; nopv
-; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; nopv
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; vaddmac dm0, dm0, dm3, x2, x1, r12
 ; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopx ; vshuffle x2, x10, x8, r6; nopv
 ; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; nopv
 ; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
@@ -170,9 +165,9 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:  // %bb.10: // %for.cond.cleanup54.i89
 ; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    vlda x4, [p7, #192]; paddb [p1], m3; padds [p7], #128; add r4, r4, #-1; nopm ; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    vlda x6, [p7, #128]; paddb [p4], #128; padds [p6], #128; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
-; CHECK-NEXT:    vlda x4, [p7, #192]; paddb.3d [p0], d1; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    nopa ; paddb.3d [p1], d2; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vlda x6, [p7, #128]; paddb.3d [p1], d2; padds [p4], #128; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vlda x4, [p7, #192]; paddb [p6], #128; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
+; CHECK-NEXT:    nopa ; paddb.3d [p0], d1; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
 ; CHECK-NEXT:    vmac dm0, dm0, x2, x4, r8
 ; CHECK-NEXT:    vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
 ; CHECK-NEXT:    vmac dm0, dm0, x2, x4, r8
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll
index 3ba9874aebe2..f4015e67392c 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll
@@ -43,8 +43,8 @@ declare i1 @llvm.loop.decrement.i32(i32) #3
 define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr %ifm, i32 %cond88.i, i20 %idx.ext.i.i, i20 %idx.ext.i330.i, i20 %idx.ext.i334.i, i32 %1, i20 %idx.ext.i338.i, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %8, i20 %9, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i32 %conv197.i, i32 %conv.i.i.i.i.i, i20 %idx.ext.i342.i, i20 %idx.ext.i344.i, i20 %17, i20 %18, i20 %19, i32 %or.i.i, i32 %cond15.i.i.i.i.i, i20 %20, i20 %21, i20 %22, i20 %23, i20 %24, i32 %or22.i.i.i.i.i) #4 personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: conv2d:
 ; CHECK:       // %bb.0: // %newFuncRoot
-; CHECK-NEXT:    paddxm [sp], #64; nopb ; nops ; nopx ; mov m4, p4; nopv
-; CHECK-NEXT:    mova m0, #-68; st p6, [sp, #-64]; nopx // 4-byte Folded Spill
+; CHECK-NEXT:    paddxm [sp], #64; nopb ; nopx ; mov m4, p4
+; CHECK-NEXT:    mova m0, #-68; st p6, [sp, #-64] // 4-byte Folded Spill
 ; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p1], m4; movs m2, p5; mov p6, sp
 ; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p1], m2; paddb [p6], m0
 ; CHECK-NEXT:    lda m5, [p6], #-4
@@ -53,8 +53,9 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr
 ; CHECK-NEXT:    lda r21, [p6], #-4; mov s0, r1
 ; CHECK-NEXT:    lda r29, [p6], #-4; movx crupsmode, #0
 ; CHECK-NEXT:    lda r25, [p6], #-4
-; CHECK-NEXT:    lda dn0, [p6], #-4
-; CHECK-NEXT:    lda r27, [p6], #-4
+; CHECK-NEXT:    lda dn0, [p6], #-4; paddb [p1], m4
+; CHECK-NEXT:    lda r27, [p6], #-4; paddb [p1], m5
+; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p1, #0]
 ; CHECK-NEXT:    lda m1, [p6], #-4
 ; CHECK-NEXT:    lda r31, [p6], #-4
 ; CHECK-NEXT:    lda r16, [p6], #-4
@@ -64,20 +65,16 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr
 ; CHECK-NEXT:    lda r18, [p6], #-4
 ; CHECK-NEXT:    lda dn3, [p6], #-4
 ; CHECK-NEXT:    lda dn7, [p6], #-4
-; CHECK-NEXT:    lda r20, [p6], #-4
-; CHECK-NEXT:    lda m6, [p6], #-4
-; CHECK-NEXT:    lda r1, [p6], #-4
-; CHECK-NEXT:    lda r26, [p6], #-4
-; CHECK-NEXT:    lda r22, [p6], #-4; mov dj4, #0
-; CHECK-NEXT:    lda m2, [p6], #-4; mov s1, r3
-; CHECK-NEXT:    lda dj2, [p6], #-4; or r28, r8, r8; mov dj3, #0
-; CHECK-NEXT:    lda dj6, [p6], #-4; movs dc2, dj4; or r30, r5, r5; mov r5, dj4
-; CHECK-NEXT:    lda dn2, [p6, #0]; movs dc6, dj4; or r8, r7, r7; mov r7, dj4
-; CHECK-NEXT:    lda dn6, [p6, #-4]; movs dc0, dj4; mov dj1, r31
-; CHECK-NEXT:    padda [p1], m4; movs dc1, dj4; mov dj5, r16
-; CHECK-NEXT:    padda [p1], m5; movs dc5, dj4; mov dj7, r18
-; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p1, #0]; movs dc3, dj4; mov r23, m5
-; CHECK-NEXT:    padda [p1], m4; movs dc7, dj4; add r0, r0, #-1; mov p6, p0
+; CHECK-NEXT:    lda r20, [p6], #-4; mov dj4, #0
+; CHECK-NEXT:    lda m6, [p6], #-4; mov s1, r3
+; CHECK-NEXT:    lda r1, [p6], #-4; mov dj3, #0
+; CHECK-NEXT:    lda r26, [p6], #-4; movs dc2, dj4; or r28, r8, r8; mov dc6, dj4
+; CHECK-NEXT:    lda r22, [p6], #-4; movs dc0, dj4; or r30, r5, r5; mov r5, dj4
+; CHECK-NEXT:    lda m2, [p6], #-4; movs dc1, dj4; or r8, r7, r7; mov r7, dj4
+; CHECK-NEXT:    lda dj2, [p6], #-4; movs dc5, dj4; mov r23, m5
+; CHECK-NEXT:    lda dj6, [p6], #-4; movs dc3, dj4; mov dj1, r31
+; CHECK-NEXT:    lda dn2, [p6, #0]; movs dc7, dj4; mov dj5, r16
+; CHECK-NEXT:    lda dn6, [p6, #-4]; paddb [p1], m4; add r0, r0, #-1; mov p6, p0; movs dj7, r18
 ; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p1, #0]; movs p0, p3; movx crsrsmode, #0; mov m4, r1
 ; CHECK-NEXT:  .LBB0_1: // %for.body.i
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll
index a4a5c9b46a78..fba50a7537bb 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll
@@ -51,7 +51,7 @@ declare i1 @llvm.loop.decrement.i32(i32) #3
 define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ptr %psum_1_tdm, ptr %ifm, ptr %add.ptr.i, <64 x i8> %1, i32 %conv10.i, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i32 %7, i32 %or25.i.i.i, i32 %8, i20 %9, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i32 %conv91.i, i32 %20, i20 %idx.ext.i216.i, i20 %21, i20 %22, i20 %23, i32 %or22.i.i.i) #4 {
 ; CHECK-LABEL: conv2d:
 ; CHECK:       // %bb.0: // %newFuncRoot
-; CHECK-NEXT:    paddxm [sp], #64
+; CHECK-NEXT:    paddxm [sp], #64; nopb ; nopx
 ; CHECK-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; CHECK-NEXT:    mova m0, #-68; mov p6, sp
 ; CHECK-NEXT:    padda [p6], m0
@@ -77,15 +77,14 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; CHECK-NEXT:    lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4
 ; CHECK-NEXT:    movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7
 ; CHECK-NEXT:    vldb.pop.3d x1, [p1, lf1, r25, d0]
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vldb.128 wl2, [p5, #0]; or r22, r12, r12; mov r19, r8
-; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r21, r10, r10; mov s0, r1
-; CHECK-NEXT:    mova r16, #5; vldb x8, [p0, #0]; or r10, r3, r3; mov s1, r5
-; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; add r0, r0, #-1; mov dc6, dc7; movs dc3, dc7
-; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc2, dc7; or r8, r7, r7; addm.nc r1, r0, #-1
-; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, #16]; movxm p4, #.LBB0_1; movs dc5, dc7
-; CHECK-NEXT:    mova r18, #16; movs dc1, dc7; movx crupsmode, #0; vshuffle x10, x10, x1, r2
-; CHECK-NEXT:    mova r12, #264; st p7, [sp, #-60]; movx crsrsmode, #0; mov m5, r17 // 4-byte Folded Spill
+; CHECK-NEXT:    or r22, r12, r12; mov r19, r8
+; CHECK-NEXT:    mova r16, #5; or r21, r10, r10; mov s0, r1
+; CHECK-NEXT:    mova r18, #16; vldb.128 wl4, [p5, #16]; or r10, r3, r3; mov s1, r5
+; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc3, dc7; or r8, r7, r7; mov dc6, dc7
+; CHECK-NEXT:    mova r12, #264; vldb.128 wl2, [p5, #0]; add r0, r0, #-1; mov dc5, dc7; movs dc2, dc7
+; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; movx crupsmode, #0; addm.nc r1, r0, #-1; movs dc1, dc7
+; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; st p7, [sp, #-60]; movxm p4, #.LBB0_1 // 4-byte Folded Spill
+; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17
 ; CHECK-NEXT:  .LBB0_1: // %for.body.i
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_2 Depth 2
@@ -110,10 +109,10 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup54.i
 ; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    vlda x4, [p7, #192]; paddb.3d [p1], d1; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.popx x10, [p1, lf1, r25]; movs dc4, dc7; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; CHECK-NEXT:    vlda x4, [p7, #192]; paddb [p0], m4; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    padds.3d [p0], d2; vldb.pop.3d x1, [p1, lf1, r25, d0]; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; CHECK-NEXT:    vlda x8, [p0, #0]; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
+; CHECK-NEXT:    vlda x6, [p7, #128]; paddb [p0], m4; movs dc4, dc7; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; CHECK-NEXT:    vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vldb x8, [p0, #0]; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
 ; CHECK-NEXT:    vldb x6, [p0, #64]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; CHECK-NEXT:    or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
 ; CHECK-NEXT:    movs dj7, r20; vldb.128 wl2, [p5, dj7]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
@@ -130,11 +129,11 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; CHECK-NEXT:    vldb.popx x8, [p1, lf1, r25]
 ; CHECK-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
 ; CHECK-NEXT:    vldb.popx x8, [p1, lf1, r25]
-; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12
-; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm3, x0, x4, r12
-; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vaddmac dm1, dm1, dm2, x10, x8, r10
-; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x10, x6, r10
-; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; nopv
+; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12
+; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12
+; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10
+; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10
 ; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
 ; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv
 ; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
@@ -168,7 +167,7 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ;
 ; NO-PROLOGUE-SPLIT-LABEL: conv2d:
 ; NO-PROLOGUE-SPLIT:       // %bb.0: // %newFuncRoot
-; NO-PROLOGUE-SPLIT-NEXT:    paddxm [sp], #64; nopb ; nopx
+; NO-PROLOGUE-SPLIT-NEXT:    paddxm [sp], #64; nopb ; nops ; nopxm ; nopv
 ; NO-PROLOGUE-SPLIT-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; NO-PROLOGUE-SPLIT-NEXT:    mova m0, #-68; mov p6, sp
 ; NO-PROLOGUE-SPLIT-NEXT:    padda [p6], m0
@@ -192,22 +191,18 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-PROLOGUE-SPLIT-NEXT:    lda m3, [p6], #-4
 ; NO-PROLOGUE-SPLIT-NEXT:    lda dj3, [p6, #0]; movx r30, #63; mov dc7, #0
 ; NO-PROLOGUE-SPLIT-NEXT:    lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4
-; NO-PROLOGUE-SPLIT-NEXT:    movs dc0, dc7; vldb.popx x4, [p1, lf1, r25]; mov dc4, dc7
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl2, [p5, #0]; vldb.pop.3d x6, [p1, lf1, r25, d0]
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p5, #16]
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2, #0]
-; NO-PROLOGUE-SPLIT-NEXT:    vldb x10, [p0, #0]
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2, #64]
-; NO-PROLOGUE-SPLIT-NEXT:    nop
-; NO-PROLOGUE-SPLIT-NEXT:    mov s0, r1
-; NO-PROLOGUE-SPLIT-NEXT:    mova r12, #264; or r21, r10, r10; mov r22, r12
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3, #0]; vldb x1, [p0, #64]; movx crupsmode, #0; vshuffle x2, x4, x6, r2; vmul dm2, x0, x2, r12
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3, #64]; or r10, r3, r3; mov r3, p5
-; NO-PROLOGUE-SPLIT-NEXT:    st p7, [sp, #-60]; mov s1, r5; vaddmac dm1, dm1, dm2, x2, x10, r10 // 4-byte Folded Spill
-; NO-PROLOGUE-SPLIT-NEXT:    movs p5, p2; mov p7, p0; vmul dm3, x0, x8, r12
-; NO-PROLOGUE-SPLIT-NEXT:    movs dc3, dc7; add r0, r0, #-1; mov dc6, dc7
-; NO-PROLOGUE-SPLIT-NEXT:    movs dc2, dc7; or r19, r8, r8; addm.nc r1, r0, #-1
-; NO-PROLOGUE-SPLIT-NEXT:    mova r16, #5; movs dc5, dc7; or r8, r7, r7; mov dc1, dc7
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl2, [p5, #0]; vldb.popx x4, [p1, lf1, r25]; movs dc0, dc7; mov dc4, dc7
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p5, #16]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p0, #64]
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2, #0]; mov s0, r1
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3, #0]; vldb x10, [p0, #0]; mov r21, r10
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2, #64]; or r10, r3, r3; mov r3, p5
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3, #64]; mov p5, p2
+; NO-PROLOGUE-SPLIT-NEXT:    st p7, [sp, #-60]; add r0, r0, #-1; mov s1, r5 // 4-byte Folded Spill
+; NO-PROLOGUE-SPLIT-NEXT:    mova r12, #264; movs p7, p0; or r22, r12, r12; mov dc3, dc7
+; NO-PROLOGUE-SPLIT-NEXT:    movs dc6, dc7; movx crupsmode, #0; vshuffle x2, x4, x6, r2; vmul dm2, x0, x2, r12
+; NO-PROLOGUE-SPLIT-NEXT:    movs dc2, dc7; or r19, r8, r8; addm.nc r1, r0, #-1; vmul dm3, x0, x8, r12
+; NO-PROLOGUE-SPLIT-NEXT:    mova r16, #5; nopb ; movs dc5, dc7; or r8, r7, r7; mov dc1, dc7; vaddmac dm1, dm1, dm2, x2, x10, r10
 ; NO-PROLOGUE-SPLIT-NEXT:    mova r18, #16; nopb ; movs p4, p3; movx crsrsmode, #0; mov m5, r17; vaddmac dm0, dm0, dm3, x2, x1, r10
 ; NO-PROLOGUE-SPLIT-NEXT:  .LBB0_1: // %for.body.i
 ; NO-PROLOGUE-SPLIT-NEXT:    // =>This Loop Header: Depth=1
@@ -235,24 +230,20 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x4, [p0, #192]; paddb [p7], m4; padds [p0], #128; nopx ; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x6, [p0, #128]; paddb.3d [p7], d2; padds.3d [p1], d1; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x4, [p0, #192]; nopb ; padds [p0], #128; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
-; NO-PROLOGUE-SPLIT-NEXT:    nopa ; vldb x10, [p7, #0]; movs p0, r3; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p7, #64]; vldb.popx x4, [p1, lf1, r25]; nops ; or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
+; NO-PROLOGUE-SPLIT-NEXT:    movs p0, r3; vldb x10, [p7, #0]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p7, #64]; vldb.popx x4, [p1, lf1, r25]; or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl2, [p0, dj7]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movs dj7, r20; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p0, dj7]; nopb ; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p0, dj7]; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2, #128]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2, #192]; nopb ; movs p2, p5; movxm p0, #.LBB0_1; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3, #128]; vmac dm1, dm1, x2, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3, #192]; mov p3, p4; vmac dm0, dm0, x2, x4, r8
-; NO-PROLOGUE-SPLIT-NEXT:    nop
-; NO-PROLOGUE-SPLIT-NEXT:    vmul dm2, x0, x2, r12
-; NO-PROLOGUE-SPLIT-NEXT:    vshuffle x2, x4, x6, r2; vmul dm3, x0, x8, r12
-; NO-PROLOGUE-SPLIT-NEXT:    nop
-; NO-PROLOGUE-SPLIT-NEXT:    vst.srs.4x dm1, s1, srssign0, [p6], m5; jnzd r1, r1, p0; vaddmac dm1, dm1, dm2, x2, x10, r10
-; NO-PROLOGUE-SPLIT-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vaddmac dm0, dm0, dm3, x2, x1, r10 // Delay Slot 5
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 4
+; NO-PROLOGUE-SPLIT-NEXT:    jnzd r1, r1, p0
+; NO-PROLOGUE-SPLIT-NEXT:    vmul dm2, x0, x2, r12 // Delay Slot 5
+; NO-PROLOGUE-SPLIT-NEXT:    vshuffle x2, x4, x6, r2; vmul dm3, x0, x8, r12 // Delay Slot 4
 ; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 3
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 2
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 1
+; NO-PROLOGUE-SPLIT-NEXT:    vst.srs.4x dm1, s1, srssign0, [p6], m5; vaddmac dm1, dm1, dm2, x2, x10, r10 // Delay Slot 2
+; NO-PROLOGUE-SPLIT-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vaddmac dm0, dm0, dm3, x2, x1, r10 // Delay Slot 1
 ; NO-PROLOGUE-SPLIT-NEXT:  // %bb.4: // %cooldown.entry
 ; NO-PROLOGUE-SPLIT-NEXT:    vldb.popx x8, [p1, lf1, r25]
 ; NO-PROLOGUE-SPLIT-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
@@ -295,7 +286,7 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ;
 ; NO-JNZD-LABEL: conv2d:
 ; NO-JNZD:       // %bb.0: // %newFuncRoot
-; NO-JNZD-NEXT:    paddxm [sp], #64; nopb ; nopxm ; nops
+; NO-JNZD-NEXT:    paddxm [sp], #64; nopb ; nops ; nopxm ; nopv
 ; NO-JNZD-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; NO-JNZD-NEXT:    mova m0, #-68; mov p6, sp
 ; NO-JNZD-NEXT:    padda [p6], m0
@@ -322,14 +313,13 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7
 ; NO-JNZD-NEXT:    vldb.pop.3d x1, [p1, lf1, r25, d0]
 ; NO-JNZD-NEXT:    nop
-; NO-JNZD-NEXT:    vldb.128 wl2, [p5, #0]
-; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movx r18, #5; mov r23, r12
-; NO-JNZD-NEXT:    mova r20, #1; vldb x8, [p0, #0]; or r19, r8, r8; mov s0, r1
-; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; or r21, r10, r10; mov s1, r5
-; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; or r10, r3, r3; mov dc3, dc7
-; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, #16]; or r8, r7, r7; mov dc2, dc7; movs dc6, dc7
-; NO-JNZD-NEXT:    mova r22, #16; movs dc5, dc7; movx crupsmode, #0; vshuffle x10, x10, x1, r2
-; NO-JNZD-NEXT:    mova r12, #264; movs dc1, dc7; movx crsrsmode, #0; mov m5, r17
+; NO-JNZD-NEXT:    mova r20, #1; movx r18, #5; mov r23, r12
+; NO-JNZD-NEXT:    mova r22, #16; vldb.128 wl4, [p5, #16]; or r19, r8, r8; mov s0, r1
+; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r21, r10, r10; mov s1, r5
+; NO-JNZD-NEXT:    mova r12, #264; vldb.128 wl2, [p5, #0]; or r10, r3, r3; mov dc3, dc7
+; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; or r8, r7, r7; mov dc2, dc7; movs dc6, dc7
+; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc5, dc7; movx crupsmode, #0; mov dc1, dc7
+; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17
 ; NO-JNZD-NEXT:  .LBB0_1: // %for.body.i
 ; NO-JNZD-NEXT:    // =>This Loop Header: Depth=1
 ; NO-JNZD-NEXT:    // Child Loop BB0_2 Depth 2
@@ -353,18 +343,18 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    vlda x6, [p4, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; NO-JNZD-NEXT:  // %bb.3: // %for.cond.cleanup54.i
 ; NO-JNZD-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; NO-JNZD-NEXT:    vlda x4, [p4, #192]; paddb.3d [p1], d1; padds [p4], #128; add r0, r0, #-1; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda x6, [p4, #128]; paddb [p0], m4; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vlda x4, [p4, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p4], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vldb x8, [p0, #0]; mov r16, dc6; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vldb x6, [p0, #64]; lshl r16, r16, r18; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    or r24, r16, r22; mov dj7, r16; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda x4, [p4, #192]; paddb [p0], m4; padds [p4], #128; add r0, r0, #-1; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda x6, [p4, #128]; paddb.3d [p1], d1; padds.3d [p0], d2; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda x4, [p4, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p4], #128; nopx ; mov r16, dc6; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda x8, [p0, #0]; vldb.pop.3d x1, [p1, lf1, r25, d0]; nops ; lshl r16, r16, r18; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda x6, [p0, #64]; nopb ; or r24, r16, r22; mov dj7, r16; vmac dm0, dm0, x2, x4, r8
 ; NO-JNZD-NEXT:    movs dj7, r24; vldb.128 wl2, [p5, dj7]; eq r16, r0, r20; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vshuffle x10, x10, x1, r2
+; NO-JNZD-NEXT:    vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vshuffle x10, x10, x1, r2
 ; NO-JNZD-NEXT:    jz r16, #.LBB0_1
 ; NO-JNZD-NEXT:    nop // Delay Slot 5
 ; NO-JNZD-NEXT:    nop // Delay Slot 4
@@ -375,11 +365,11 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
 ; NO-JNZD-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
 ; NO-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
-; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12
-; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm3, x0, x4, r12
-; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vaddmac dm1, dm1, dm2, x10, x8, r10
-; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x10, x6, r10
-; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; nopv
+; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12
+; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12
+; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10
+; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10
 ; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
 ; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv
 ; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
@@ -410,6 +400,121 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    nop // Delay Slot 3
 ; NO-JNZD-NEXT:    nop // Delay Slot 2
 ; NO-JNZD-NEXT:    nop // Delay Slot 1
+; USE-JNZD-LABEL: conv2d:
+; USE-JNZD:       // %bb.0: // %newFuncRoot
+; USE-JNZD-NEXT:    paddxm [sp], #64; nopb ; nopx
+; USE-JNZD-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
+; USE-JNZD-NEXT:    mova m0, #-68; mov p6, sp
+; USE-JNZD-NEXT:    padda [p6], m0
+; USE-JNZD-NEXT:    lda m0, [p6], #-4
+; USE-JNZD-NEXT:    lda dn0, [p6], #-4
+; USE-JNZD-NEXT:    lda dj0, [p6], #-4
+; USE-JNZD-NEXT:    lda dn4, [p6], #-4
+; USE-JNZD-NEXT:    lda dj4, [p6], #-4
+; USE-JNZD-NEXT:    lda m4, [p6], #-4
+; USE-JNZD-NEXT:    lda m1, [p6], #-4
+; USE-JNZD-NEXT:    lda dj1, [p6], #-4
+; USE-JNZD-NEXT:    lda dj5, [p6], #-4
+; USE-JNZD-NEXT:    lda dn1, [p6], #-4
+; USE-JNZD-NEXT:    lda dn5, [p6], #-4
+; USE-JNZD-NEXT:    lda m2, [p6], #-4
+; USE-JNZD-NEXT:    lda dj2, [p6], #-4
+; USE-JNZD-NEXT:    lda dj6, [p6], #-4
+; USE-JNZD-NEXT:    lda dn2, [p6], #-4
+; USE-JNZD-NEXT:    lda dn6, [p6], #-4
+; USE-JNZD-NEXT:    lda r17, [p6], #-4
+; USE-JNZD-NEXT:    lda m3, [p6], #-4
+; USE-JNZD-NEXT:    lda dj3, [p6, #0]; movx r30, #63; mov dc7, #0
+; USE-JNZD-NEXT:    lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4
+; USE-JNZD-NEXT:    movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7
+; USE-JNZD-NEXT:    vldb.pop.3d x1, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    or r22, r12, r12; mov r19, r8
+; USE-JNZD-NEXT:    mova r16, #5; or r21, r10, r10; mov s0, r1
+; USE-JNZD-NEXT:    mova r18, #16; vldb.128 wl4, [p5, #16]; or r10, r3, r3; mov s1, r5
+; USE-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc3, dc7; or r8, r7, r7; mov dc6, dc7
+; USE-JNZD-NEXT:    mova r12, #264; vldb.128 wl2, [p5, #0]; add r0, r0, #-1; mov dc5, dc7; movs dc2, dc7
+; USE-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; movx crupsmode, #0; addm.nc r1, r0, #-1; movs dc1, dc7
+; USE-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; st p7, [sp, #-60]; movxm p4, #.LBB0_1 // 4-byte Folded Spill
+; USE-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17
+; USE-JNZD-NEXT:  .LBB0_1: // %for.body.i
+; USE-JNZD-NEXT:    // =>This Loop Header: Depth=1
+; USE-JNZD-NEXT:    // Child Loop BB0_2 Depth 2
+; USE-JNZD-NEXT:    nopa ; vldb.popx x10, [p1, lf1, r25]; nops ; nopxm ; nopv
+; USE-JNZD-NEXT:    vldb.pop.3d x8, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    vldb.popx x10, [p1, lf1, r25]; mov p7, p0
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p7], #128; vmul dm3, x0, x4, r12
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; movxm ls, #.LBB0_2; vaddmac dm1, dm1, dm2, x10, x8, r10
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; movxm le, #.L_LEnd1; vaddmac dm0, dm0, dm3, x10, x6, r10
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; nopv
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:  .LBB0_2: // %for.body55.i
+; USE-JNZD-NEXT:    // Parent Loop BB0_1 Depth=1
+; USE-JNZD-NEXT:    // => This Inner Loop Header: Depth=2
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:  .L_LEnd1:
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:  // %bb.3: // %for.cond.cleanup54.i
+; USE-JNZD-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; paddb.3d [p1], d1; padds [p7], #128; nopx ; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; paddb [p0], m4; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    nopa ; vldb x8, [p0, #0]; nops ; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vldb x6, [p0, #64]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    movs dj7, r20; vldb.128 wl2, [p5, dj7]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    jnzd r1, r1, p4
+; USE-JNZD-NEXT:    nop // Delay Slot 5
+; USE-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64 // Delay Slot 4
+; USE-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64 // Delay Slot 3
+; USE-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vst.srs.4x dm1, s1, srssign0, [p6], m5 // Delay Slot 2
+; USE-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vshuffle x10, x10, x1, r2 // Delay Slot 1
+; USE-JNZD-NEXT:  // %bb.4: // %cooldown.entry
+; USE-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
+; USE-JNZD-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:  .LBB0_5: // %for.body55.i.cd
+; USE-JNZD-NEXT:    // =>This Inner Loop Header: Depth=1
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:  .L_LEnd0:
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:  // %bb.6: // %cooldown.exit
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; nopb ; padds [p0], #128; movx crsrsmode, #0; mov s0, r5; vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; nopb ; movs dj0, r17; or r12, r22, r22; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; nopb ; padds [p0], #128; or r10, r21, r21; mov srssign0, r6; vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    lda p7, [sp, #-60]; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 // 4-byte Folded Reload
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    nop
+; USE-JNZD-NEXT:    nop
+; USE-JNZD-NEXT:    lda p6, [sp, #-64] // 4-byte Folded Reload
+; USE-JNZD-NEXT:    ret lr
+; USE-JNZD-NEXT:    vst.srs.4x dm1, s0, srssign0, [p6, #0] // Delay Slot 5
+; USE-JNZD-NEXT:    vst.srs.4x dm0, s0, srssign0, [p6, dj0] // Delay Slot 4
+; USE-JNZD-NEXT:    nop // Delay Slot 3
+; USE-JNZD-NEXT:    nop // Delay Slot 2
+; USE-JNZD-NEXT:    paddxm [sp], #-64; movx srssign0, #0; mov r8, r19 // Delay Slot 1
 newFuncRoot:
   br label %for.body.i
 
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll
index 125947b70f3d..dfbc9e84aaec 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll
@@ -45,7 +45,7 @@ declare i1 @llvm.loop.decrement.i32(i32) #2
 define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, ptr %p_psum, ptr %p_c, ptr %p_bias, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %idx.ext.i, i20 %8, i20 %9, i20 %10, i20 %11, i20 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %or23.i.i.i.i, <64 x i8> %17, i32 %18, i32 %19, i32 %20, i32 %21, i32 %22, i32 %or22.i.i.i.i, i32 %conv166, i32 %conv.i.i.i.i, i20 %idx.ext.i478, i20 %23, i20 %24, i20 %25, i20 %26, i20 %27, i20 %28) #3 {
 ; CHECK-LABEL: gemm:
 ; CHECK:       // %bb.0: // %newFuncRoot
-; CHECK-NEXT:    paddxm [sp], #64; nopx
+; CHECK-NEXT:    paddxm [sp], #64; nopb ; nopx
 ; CHECK-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; CHECK-NEXT:    mova m0, #-68; mov p6, sp
 ; CHECK-NEXT:    padda [p6], m0
@@ -66,34 +66,33 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:    lda r17, [p6], #-4
 ; CHECK-NEXT:    lda r19, [p6], #-4
 ; CHECK-NEXT:    lda m5, [p6], #-4
-; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64
-; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]
-; CHECK-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; add r0, r0, #-1; mov r21, r8
-; CHECK-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r22, #0; mov dc0, #0
-; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r23, r10, r10; mov s0, r22
-; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc1, dc0; or r10, r5, r5; mov dc5, dc0
-; CHECK-NEXT:    lda m2, [p6], #-4; vldb.3d x10, [p1], d1; movx crupsmode, #0; vbcst.32 x2, r22
-; CHECK-NEXT:    lda dj2, [p6], #-4; movs dc4, dc0; movx r22, #15; addm.nc r5, r0, #-1
-; CHECK-NEXT:    lda dn2, [p6], #-4; vldb x6, [p0], #64; movs m0, p5; vsel.32 x4, x2, x4, r22
-; CHECK-NEXT:    lda m3, [p6], #-4; vldb.3d x8, [p0], d0; vsel.32 x2, x2, x6, r22
-; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r1
+; CHECK-NEXT:    vldb.128 wl4, [p4, #0]
+; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; or r21, r8, r8; mov dc2, #0
+; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r22, #0; mov dc0, #0
+; CHECK-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; add r0, r0, #-1; vbcst.32 x2, r22
+; CHECK-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; or r23, r10, r10; mov s0, r22
+; CHECK-NEXT:    lda m2, [p6], #-4; movs dc1, dc0; or r10, r5, r5; mov dc5, dc0
+; CHECK-NEXT:    lda dj2, [p6], #-4; vldb.3d x10, [p1], d1; movx r22, #15; addm.nc r5, r0, #-1
+; CHECK-NEXT:    lda dn2, [p6], #-4; movx crupsmode, #0; vsel.32 x4, x2, x4, r22
+; CHECK-NEXT:    lda m3, [p6], #-4; vsel.32 x2, x2, x6, r22
+; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r1
 ; CHECK-NEXT:    lda dj3, [p6, #0]; vshuffle x1, x6, x0, r2
 ; CHECK-NEXT:    lda dn3, [p6, #-4]; movxm p6, #.LBB0_1
-; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p2], #64; vshuffle x8, x10, x0, r3
-; CHECK-NEXT:    mova dc2, #0; or r24, r12, r12; mov s1, r17
-; CHECK-NEXT:    mova r12, #776; movs dc3, dc2; movx crsrsmode, #0; vshuffle x10, x8, x0, r4
+; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vldb x6, [p0], #64; or r24, r12, r12; mov s1, r17; movs dc4, dc0
+; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; movs m0, p5; movx r12, #776; vshuffle x8, x10, x0, r3
+; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p2], #64; vldb.3d x8, [p0], d0; movx crsrsmode, #0; vshuffle x10, x8, x0, r4; movs dc3, dc2
 ; CHECK-NEXT:  .LBB0_1: // %for.body
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_2 Depth 2
-; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; nopxm ; vmul dm4, x0, x4, r12
+; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; nopxm ; nopv
 ; CHECK-NEXT:    vlda.3d x7, [p1], d1; nopb ; nopx
-; CHECK-NEXT:    vldb x5, [p0], #64; vaddmac dm3, dm3, dm4, x6, x1, r10
-; CHECK-NEXT:    vlda.3d x3, [p0], d0; vmul dm4, x0, x2, r12
-; CHECK-NEXT:    movs m0, p5; vldb x9, [p1], m4; vaddmac dm2, dm2, dm4, x8, x1, r10
-; CHECK-NEXT:    vlda.3d x7, [p1], d1; movxm ls, #.LBB0_2; vaddmac dm1, dm1, dm4, x6, x10, r10
-; CHECK-NEXT:    vldb x5, [p0], #64; movxm le, #.L_LEnd1; vaddmac dm0, dm0, dm4, x8, x10, r10
-; CHECK-NEXT:    vlda.3d x3, [p0], d0; vshuffle x1, x9, x0, r7
-; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; add.nc lc, r6, #-3; vshuffle x10, x1, x0, r16; nopv
+; CHECK-NEXT:    vldb x5, [p0], #64; vmul dm4, x0, x4, r12
+; CHECK-NEXT:    vlda.3d x3, [p0], d0
+; CHECK-NEXT:    movs m0, p5; vldb x9, [p1], m4; vaddmac dm3, dm3, dm4, x6, x1, r10
+; CHECK-NEXT:    vlda.3d x7, [p1], d1; movxm ls, #.LBB0_2; vmul dm4, x0, x2, r12
+; CHECK-NEXT:    vldb x5, [p0], #64; movxm le, #.L_LEnd1; vaddmac dm2, dm2, dm4, x8, x1, r10
+; CHECK-NEXT:    vlda.3d x3, [p0], d0; vshuffle x1, x9, x0, r7; vaddmac dm1, dm1, dm4, x6, x10, r10
+; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; add.nc lc, r6, #-3; vshuffle x10, x1, x0, r16; vaddmac dm0, dm0, dm4, x8, x10, r10
 ; CHECK-NEXT:    vlda.3d x7, [p1], d1; nopb ; nops ; nopx ; vshuffle x8, x7, x0, r18; nopv
 ; CHECK-NEXT:    nopa ; vldb x5, [p0], #64; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
 ; CHECK-NEXT:    vlda.3d x3, [p0], d0; nopb ; nops ; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
@@ -236,7 +235,7 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p1], m4; paddb.2d [p4], d7; movs p2, p7; nopx ; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    padda [p2], m6; vldb x8, [p0], #64; movs m0, p5; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    nopa ; vldb.3d x10, [p0], d0; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
-; NO-PROLOGUE-SPLIT-NEXT:    vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
+; NO-PROLOGUE-SPLIT-NEXT:    nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p7, dj2]; vldb.128 wl3, [p4, #16]; movs p7, p6; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
@@ -250,11 +249,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; NO-PROLOGUE-SPLIT-NEXT:    vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22
 ; NO-PROLOGUE-SPLIT-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x1, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r10
 ; NO-PROLOGUE-SPLIT-NEXT:    movxm p2, #.LBB0_1; vaddmac dm2, dm2, dm4, x10, x6, r10
-; NO-PROLOGUE-SPLIT-NEXT:    vshuffle x6, x1, x0, r4; vmul dm4, x0, x4, r12
-; NO-PROLOGUE-SPLIT-NEXT:    jnzd r5, r5, p2
-; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r10 // Delay Slot 5
-; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm0, dm0, dm4, x10, x6, r10 // Delay Slot 4
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 3
+; NO-PROLOGUE-SPLIT-NEXT:    jnzd r5, r5, p2; vshuffle x6, x1, x0, r4; vmul dm4, x0, x4, r12
+; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 5
+; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r10 // Delay Slot 4
+; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm0, dm0, dm4, x10, x6, r10 // Delay Slot 3
 ; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 2
 ; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 1
 ; NO-PROLOGUE-SPLIT-NEXT:  // %bb.4: // %cooldown.entry
@@ -300,7 +298,7 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ;
 ; NO-JNZD-LABEL: gemm:
 ; NO-JNZD:       // %bb.0: // %newFuncRoot
-; NO-JNZD-NEXT:    paddxm [sp], #64; nopxm
+; NO-JNZD-NEXT:    paddxm [sp], #64; nopb ; nopxm ; nops
 ; NO-JNZD-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; NO-JNZD-NEXT:    mova m0, #-68; mov p6, sp
 ; NO-JNZD-NEXT:    padda [p6], m0
@@ -322,19 +320,19 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; NO-JNZD-NEXT:    lda r19, [p6], #-4
 ; NO-JNZD-NEXT:    lda m5, [p6], #-4
 ; NO-JNZD-NEXT:    nop
-; NO-JNZD-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]; mov dc2, #0
+; NO-JNZD-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]
 ; NO-JNZD-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; mov r21, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r24, #0; mov dc0, #0
-; NO-JNZD-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; vbcst.32 x2, r24
-; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc1, dc0; mov dc5, dc0
+; NO-JNZD-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; mov dc2, #0
+; NO-JNZD-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; mov dc0, #0
+; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc1, dc0; movx r24, #0; mov dc5, dc0
 ; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vldb.3d x1, [p1], d1; movx crupsmode, #0; mov s0, r24
-; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; vldb x6, [p0], #64; movx r24, #15; mov m0, p5; movs dc4, dc0
-; NO-JNZD-NEXT:    lda m2, [p6], #-4; vldb.3d x8, [p0], d0; vsel.32 x4, x2, x4, r24
+; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; movx r24, #15; vbcst.32 x2, r24
+; NO-JNZD-NEXT:    lda m2, [p6], #-4; vsel.32 x4, x2, x4, r24
 ; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p2], #64; or r23, r10, r10; vsel.32 x2, x2, x6, r24
 ; NO-JNZD-NEXT:    lda dj2, [p6], #-4; or r25, r12, r12; vshuffle x8, x8, x0, r1
-; NO-JNZD-NEXT:    lda dn2, [p6], #-4; movx r22, #1; vshuffle x10, x8, x0, r2
-; NO-JNZD-NEXT:    lda m3, [p6], #-4; movx r12, #776; mov s1, r17
-; NO-JNZD-NEXT:    lda dj3, [p6, #0]; or r10, r5, r5; vshuffle x1, x1, x0, r3
+; NO-JNZD-NEXT:    lda dn2, [p6], #-4; vldb x6, [p0], #64; movx r22, #1; vshuffle x10, x8, x0, r2; movs dc4, dc0
+; NO-JNZD-NEXT:    lda m3, [p6], #-4; movs m0, p5; movx r12, #776; mov s1, r17
+; NO-JNZD-NEXT:    lda dj3, [p6, #0]; vldb.3d x8, [p0], d0; or r10, r5, r5; vshuffle x1, x1, x0, r3
 ; NO-JNZD-NEXT:    lda dn3, [p6, #-4]; movs dc3, dc2; movx crsrsmode, #0; vshuffle x1, x1, x0, r4
 ; NO-JNZD-NEXT:  .LBB0_1: // %for.body
 ; NO-JNZD-NEXT:    // =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll
index 502ee192b9be..3994f2eae751 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll
@@ -106,9 +106,9 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup99
 ; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    padda [p2], m5; paddb.2d [p4], d7; movs m0, p5; add r0, r0, #-1; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
-; CHECK-NEXT:    nopa ; vldb x8, [p0], #64; movs p6, p2; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
-; CHECK-NEXT:    nopa ; vldb x10, [p1], m4; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
-; CHECK-NEXT:    nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
+; CHECK-NEXT:    movs p6, p2; vldb x8, [p0], #64; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
+; CHECK-NEXT:    vldb x10, [p1], m4; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
+; CHECK-NEXT:    vldb.3d x5, [p1], d1; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
 ; CHECK-NEXT:    vlda.3d x1, [p0], d0; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p6], #64; vldb.128 wl3, [p4, #16]; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p6], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
@@ -122,12 +122,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:    vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22
 ; CHECK-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x10, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r12
 ; CHECK-NEXT:    vshuffle x6, x10, x0, r4; vaddmac dm2, dm2, dm4, x1, x6, r12
-; CHECK-NEXT:    vmul dm4, x0, x4, r10
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    jnz r0, #.LBB0_1; vaddmac dm1, dm1, dm4, x8, x6, r12
-; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    jnz r0, #.LBB0_1; vmul dm4, x0, x4, r10
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r12 // Delay Slot 4
+; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 ; CHECK-NEXT:  // %bb.4: // %cooldown.entry
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll
index c73701b5f2da..c9f84a0887b7 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll
@@ -107,9 +107,9 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup99
 ; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    padda [p2], m5; paddb.2d [p4], d7; movs m0, p5; add r0, r0, #-1; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
-; CHECK-NEXT:    nopa ; vldb x8, [p0], #64; movs p6, p2; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
-; CHECK-NEXT:    nopa ; vldb x10, [p1], m4; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
-; CHECK-NEXT:    nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
+; CHECK-NEXT:    movs p6, p2; vldb x8, [p0], #64; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
+; CHECK-NEXT:    vldb x10, [p1], m4; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
+; CHECK-NEXT:    vldb.3d x5, [p1], d1; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
 ; CHECK-NEXT:    vlda.3d x1, [p0], d0; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p6], #64; vldb.128 wl3, [p4, #16]; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p6], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
@@ -123,12 +123,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:    vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22
 ; CHECK-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x10, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r12
 ; CHECK-NEXT:    vshuffle x6, x10, x0, r4; vaddmac dm2, dm2, dm4, x1, x6, r12
-; CHECK-NEXT:    vmul dm4, x0, x4, r10
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    jnz r0, #.LBB0_1; vaddmac dm1, dm1, dm4, x8, x6, r12
-; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    jnz r0, #.LBB0_1; vmul dm4, x0, x4, r10
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r12 // Delay Slot 4
+; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 ; CHECK-NEXT:  // %bb.4: // %cooldown.entry
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll b/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll
index a31040ee5271..3b6b3ed3e574 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll
@@ -204,8 +204,8 @@ define dso_local void @_Z5test4i(i32 noundef %n) {
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    nop // Delay Slot 3
-; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 2
-; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 1
 ; AIE2P-NEXT:  .LBB1_1: // %for.body
 ; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
 ; AIE2P-NEXT:    nopa ; nopb ; jl #_Z16addToSymbolTablePKci; nops
@@ -309,10 +309,10 @@ define dso_local void @memcpy_lowered_to_call(ptr nocapture writeonly %a, ptr no
 ; AIE2P-NEXT:    nopa ; nopb ; st r8, [sp, #-60]; ge r0, r1, r0; mov r8, r0; nopv // 4-byte Folded Spill
 ; AIE2P-NEXT:    jnz r0, #.LBB2_3
 ; AIE2P-NEXT:    nopx // Delay Slot 5
-; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 4
-; AIE2P-NEXT:    st p6, [sp, #-56] // 4-byte Folded Spill Delay Slot 3
-; AIE2P-NEXT:    st p7, [sp, #-52] // 4-byte Folded Spill Delay Slot 2
-; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 4
+; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 3
+; AIE2P-NEXT:    st p6, [sp, #-56] // 4-byte Folded Spill Delay Slot 2
+; AIE2P-NEXT:    st p7, [sp, #-52] // 4-byte Folded Spill Delay Slot 1
 ; AIE2P-NEXT:  // %bb.1:
 ; AIE2P-NEXT:    movs p6, p0; mov p7, p1
 ; AIE2P-NEXT:  .LBB2_2: // %for.body
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
index 39aa8210245f..8d4b34ef041c 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
@@ -24,17 +24,16 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ; AIE2:       // %bb.0: // %for.cond3.preheader.lr.ph
 ; AIE2-NEXT:    nopb ; mova r3, #0; nops ; nopxm ; nopv
 ; AIE2-NEXT:    mova r4, #2; nopx
-; AIE2-NEXT:    movxm p2, #.LBB0_2
 ; AIE2-NEXT:    lda r2, [p0, #0]
+; AIE2-NEXT:    movxm p2, #.LBB0_2
 ; AIE2-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2-NEXT:    nopa ; lshl r5, r3, r4; nopm
+; AIE2-NEXT:    nopa ; nopb ; lshl r5, r3, r4; nopm
 ; AIE2-NEXT:    mov dj0, r5
 ; AIE2-NEXT:    lda p3, [p1, dj0]
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
-; AIE2-NEXT:    nop
 ; AIE2-NEXT:    mova r6, #0
 ; AIE2-NEXT:    add.nc r5, r1, #-1
 ; AIE2-NEXT:  .LBB0_2: // %for.body6
@@ -73,17 +72,16 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ; AIE2P:       // %bb.0: // %for.cond3.preheader.lr.ph
 ; AIE2P-NEXT:    mova r3, #0; nopb ; nops ; nopxm ; nopv
 ; AIE2P-NEXT:    mova r4, #2; nopx
-; AIE2P-NEXT:    movxm p2, #.LBB0_2
 ; AIE2P-NEXT:    lda r2, [p0, #0]
+; AIE2P-NEXT:    movxm p2, #.LBB0_2
 ; AIE2P-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2P-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2P-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2P-NEXT:    nopa ; lshl r5, r3, r4; nopm
+; AIE2P-NEXT:    nopa ; nopb ; lshl r5, r3, r4; nopm
 ; AIE2P-NEXT:    mov dj0, r5
 ; AIE2P-NEXT:    lda p3, [p1, dj0]
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
-; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    mova r6, #0
 ; AIE2P-NEXT:    add.nc r5, r1, #-1
 ; AIE2P-NEXT:  .LBB0_2: // %for.body6
@@ -127,12 +125,11 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ; AIE2PS-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2PS-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2PS-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2PS-NEXT:    nopa ; lshl r16, r4, r6; nopm
+; AIE2PS-NEXT:    nopa ; nopb ; lshl r16, r4, r6; nopm
 ; AIE2PS-NEXT:    mov dj0, r16
 ; AIE2PS-NEXT:    lda p3, [p1, dj0]
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
-; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    addm.nc r3, r1, #-1
 ; AIE2PS-NEXT:    mova r16, #0
 ; AIE2PS-NEXT:  .LBB0_2: // %for.body6
diff --git a/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll b/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll
index cb2b638af789..4c89cd3bc47c 100644
--- a/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll
+++ b/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll
@@ -23,8 +23,8 @@ define void @test_commit_block_schedule(i1 %0) {
 ; CHECK-NEXT:    nopa ; nopb ; nops ; movxm le, #.L_LEnd0; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; nopv
 ; CHECK-NEXT:  .LBB0_2: // %for.body54
 ; CHECK-NEXT:    // Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AIE/switch.ll b/llvm/test/CodeGen/AIE/switch.ll
index 01686e40e019..2ec5147ccf10 100644
--- a/llvm/test/CodeGen/AIE/switch.ll
+++ b/llvm/test/CodeGen/AIE/switch.ll
@@ -164,8 +164,8 @@ define  i32 @test(i8 signext %i) noinline nounwind optnone {
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    nop // Delay Slot 3
-; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 2
-; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 1
 ; AIE2P-NEXT:  // %bb.1: // %entry
 ; AIE2P-NEXT:    movxm p0, ##.LJTI0_0
 ; AIE2P-NEXT:    movxm r1, #1048575