From de38636b3532f97b1642eef56eaf6fc560b92aea Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Mon, 18 May 2026 17:32:15 +0200
Subject: [PATCH 1/9] [AIE][InterBlockScheduling] Separate region creation and
 Pro/Epi insertion

---
 .../Target/AIE/AIEInterBlockScheduling.cpp    | 76 +++++++++++--------
 llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 23 ++++--
 2 files changed, 58 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index 61ad93cc7711..6f2cb373825a 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -839,16 +839,17 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB,
   DEBUG_BLOCKS(dbgs() << "    >> enterRegion, Iter=" << BS.FixPoint.NumIters
                       << "\n");
 
-  // Only add regions of loops when in the GatheringRegions phase
+  // Only add regions of loops when in the GatheringRegions phase.
   if (BS.Kind != BlockType::Loop ||
       BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
-    ArrayRef<MachineBundle> TopFixedBundles =
-        RegionBegin == BB->begin() ? ArrayRef<MachineBundle>(BS.TopInsert)
-                                   : ArrayRef<MachineBundle>();
-    ArrayRef<MachineBundle> BotFixedBundles =
-        RegionEnd == BB->end() ? ArrayRef<MachineBundle>(BS.BottomInsert)
-                               : ArrayRef<MachineBundle>();
-    BS.addRegion(BB, RegionBegin, RegionEnd, TopFixedBundles, BotFixedBundles);
+    BS.addRegion(BB, RegionBegin, RegionEnd);
+    // Fixed bundles result from loop pipelining and are set separately on the
+    // region, after the instructions have been physically inserted into the
+    // block by emitInterBlockTop / emitInterBlockBottom.
+    if (RegionBegin == BB->begin() && !BS.TopInsert.empty())
+      BS.getCurrentRegion().setTopFixedBundles(BS.TopInsert);
+    if (RegionEnd == BB->end() && !BS.BottomInsert.empty())
+      BS.getCurrentRegion().setBotFixedBundles(BS.BottomInsert);
   }
 }
 
@@ -1174,32 +1175,13 @@ bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
 }
 
 Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
-               MachineBasicBlock::iterator End,
-               ArrayRef<MachineBundle> TopFixedBundles,
-               ArrayRef<MachineBundle> BotFixedBundles)
-    : BB(BB), TopFixedBundles(TopFixedBundles),
-      BotFixedBundles(BotFixedBundles) {
-  MachineBasicBlock::iterator FreeBegin =
-      std::next(Begin, TopFixedBundles.size());
-  MachineBasicBlock::iterator FreeEnd = std::prev(End, BotFixedBundles.size());
-
-  // Verify that all fixed instructions are at the right place in the MBB
-  assert(TopFixedBundles.empty() || Begin == BB->begin());
-  assert(TopFixedBundles.empty() ||
-         all_of(TopFixedBundles.back().Instrs, [FreeBegin](
-                                                   const MachineInstr *MI) {
-           return getBundleStart(MI->getIterator()) == std::prev(FreeBegin);
-         }));
-  assert(BotFixedBundles.empty() || End == BB->end());
-  assert(
-      BotFixedBundles.empty() ||
-      all_of(BotFixedBundles.front().Instrs, [FreeEnd](const MachineInstr *MI) {
-        return getBundleStart(MI->getIterator()) == FreeEnd;
-      }));
-
+               MachineBasicBlock::iterator End)
+    : BB(BB) {
   // When the region is created, its instructions haven't been re-ordered yet,
-  // so this is effectively saving the semantic order.
-  for (auto It = FreeBegin; It != FreeEnd; ++It) {
+  // so this is effectively saving the semantic order. Fixed bundles (if any)
+  // are set separately via setTopFixedBundles / setBotFixedBundles, which
+  // will trim the corresponding entries from SemanticOrder.
+  for (auto It = Begin; It != End; ++It) {
     SemanticOrder.push_back(&*It);
   }
   if (End != BB->end()) {
@@ -1207,6 +1189,34 @@ Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
   }
 }
 
+void Region::setTopFixedBundles(ArrayRef<MachineBundle> Bundles) {
+  assert(TopFixedBundles.empty() && "TopFixedBundles already set.");
+  // Verify the fixed instructions are physically at the top of the block.
+  const auto FreeBegin = std::next(BB->begin(), Bundles.size());
+  assert(all_of(Bundles.back().Instrs, [FreeBegin](const MachineInstr *MI) {
+    return getBundleStart(MI->getIterator()) == std::prev(FreeBegin);
+  }));
+  TopFixedBundles = Bundles;
+  // Remove the fixed instructions from the front of SemanticOrder so that
+  // getFreeInstructions() returns only the truly free instructions.
+  SemanticOrder.erase(SemanticOrder.begin(),
+                      SemanticOrder.begin() + Bundles.size());
+}
+
+void Region::setBotFixedBundles(ArrayRef<MachineBundle> Bundles) {
+  assert(BotFixedBundles.empty() && "BotFixedBundles already set.");
+  // Verify the fixed instructions are physically at the bottom of the block.
+  const auto FreeEnd = std::prev(BB->end(), Bundles.size());
+  assert(all_of(Bundles.front().Instrs, [FreeEnd](const MachineInstr *MI) {
+    return getBundleStart(MI->getIterator()) == FreeEnd;
+  }));
+  BotFixedBundles = Bundles;
+  // Remove the fixed instructions from the back of SemanticOrder so that
+  // getFreeInstructions() returns only the truly free instructions.
+  SemanticOrder.erase(SemanticOrder.end() - Bundles.size(),
+                      SemanticOrder.end());
+}
+
 BlockState::BlockState(MachineBasicBlock *Block) : TheBlock(Block) {
   classify();
   setBlockProperties();
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index bbd3e75770f6..32682718021b 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -153,9 +153,7 @@ class Region {
 
 public:
   Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
-         MachineBasicBlock::iterator End,
-         ArrayRef<MachineBundle> TopFixedBundles,
-         ArrayRef<MachineBundle> BotFixedBundles);
+         MachineBasicBlock::iterator End);
 
   using free_iterator = std::vector<MachineInstr *>::const_iterator;
   using fixed_iterator = MachineBasicBlock::iterator;
@@ -181,6 +179,18 @@ class Region {
   }
   ArrayRef<MachineBundle> getBotFixedBundles() const { return BotFixedBundles; }
 
+  /// Set the fixed bundles at the top of the region (e.g. a SWP epilogue).
+  /// The instructions must already be physically present at the start of the
+  /// block. Trims SemanticOrder to exclude the newly fixed instructions.
+  /// \pre The region starts at BB->begin().
+  void setTopFixedBundles(ArrayRef<MachineBundle> Bundles);
+
+  /// Set the fixed bundles at the bottom of the region (e.g. a SWP prologue).
+  /// The instructions must already be physically present at the end of the
+  /// block. Trims SemanticOrder to exclude the newly fixed instructions.
+  /// \pre The region ends at BB->end().
+  void setBotFixedBundles(ArrayRef<MachineBundle> Bundles);
+
   MachineInstr *getExitInstr() const { return ExitInstr; }
 
   std::vector<MachineBundle> Bundles;
@@ -240,15 +250,12 @@ class BlockState {
     TheBundles.insert(TheBundles.end(), Bundles.begin(), Bundles.end());
   }
   void addRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator RegionBegin,
-                 MachineBasicBlock::iterator RegionEnd,
-                 ArrayRef<MachineBundle> TopFixedBundles,
-                 ArrayRef<MachineBundle> BotFixedBundles) {
+                 MachineBasicBlock::iterator RegionEnd) {
     assert((Kind == BlockType::Loop &&
             FixPoint.Stage == SchedulingStage::GatheringRegions) ||
            FixPoint.Stage == SchedulingStage::Scheduling);
     CurrentRegion = Regions.size();
-    Regions.emplace_back(BB, RegionBegin, RegionEnd, TopFixedBundles,
-                         BotFixedBundles);
+    Regions.emplace_back(BB, RegionBegin, RegionEnd);
   }
   auto &getCurrentRegion() const { return Regions.at(CurrentRegion); }
   auto &getCurrentRegion() { return Regions[CurrentRegion]; }

From 6c368a3b4e27ebb0c4850f3e5ca4b56228310501 Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Mon, 18 May 2026 18:26:23 +0200
Subject: [PATCH 2/9] [AIE][InterBlock] Also use GatheringRegions for regular,
 non-loop blocks

---
 .../Target/AIE/AIEInterBlockScheduling.cpp    | 71 ++++++++++++-------
 llvm/lib/Target/AIE/AIEInterBlockScheduling.h |  7 +-
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp   |  7 +-
 3 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index 6f2cb373825a..da818941314f 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -383,6 +383,13 @@ void InterBlockScheduling::enterBlock(MachineBasicBlock *BB) {
                       << CurrentBlockState->kindAsString() << " FixPointIter="
                       << CurrentBlockState->FixPoint.NumIters
                       << " II=" << CurrentBlockState->FixPoint.II << "\n");
+  // Emit SWP prologues/epilogues that belong to this block. This only applies
+  // in the Scheduling stage: during GatheringRegions the regions are only being
+  // recorded without physically inserting any SWP code yet.
+  if (CurrentBlockState->FixPoint.Stage != SchedulingStage::GatheringRegions) {
+    emitInterBlockTop(*CurrentBlockState);
+    emitInterBlockBottom(*CurrentBlockState);
+  }
 }
 namespace {
 /// This implements the interface to the postpipeliner to extract the
@@ -610,19 +617,22 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
 }
 
 SchedulingStage InterBlockScheduling::updateFixPoint(BlockState &BS) {
-  if (BS.Kind != BlockType::Loop) {
-    return SchedulingStage::SchedulingDone;
-  }
-
   if (BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
-    // This is the first time we schedule this loop. In that first
-    // iteration, we have recorded the region decomposition.
-    // Now we can create the interblock edges between the top and the bottom
-    // region
-    BS.initInterBlock(*Context, *HR);
+    // This is the first visit to this block. The region decomposition has been
+    // gathered. Now transition to Scheduling so the next pass actually
+    // schedules the gathered regions.
+    if (BS.Kind == BlockType::Loop) {
+      // For loops, also create the interblock edges between the top and the
+      // bottom region.
+      BS.initInterBlock(*Context, *HR);
+    }
     return SchedulingStage::Scheduling;
   }
 
+  if (BS.Kind != BlockType::Loop) {
+    return SchedulingStage::SchedulingDone;
+  }
+
   BS.FixPoint.NumIters++;
   if (BS.FixPoint.Stage == SchedulingStage::Scheduling) {
     return updateScheduling(BS);
@@ -839,13 +849,29 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB,
   DEBUG_BLOCKS(dbgs() << "    >> enterRegion, Iter=" << BS.FixPoint.NumIters
                       << "\n");
 
-  // Only add regions of loops when in the GatheringRegions phase.
-  if (BS.Kind != BlockType::Loop ||
-      BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
+  if (BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
+    // Gather region boundaries and capture the invariant SemanticOrder for all
+    // block types. Fixed bundles are NOT set here: they result from loop
+    // pipelining, which happens during Scheduling, and are applied via the
+    // setTopFixedBundles / setBotFixedBundles calls in the Scheduling pass.
     BS.addRegion(BB, RegionBegin, RegionEnd);
-    // Fixed bundles result from loop pipelining and are set separately on the
-    // region, after the instructions have been physically inserted into the
-    // block by emitInterBlockTop / emitInterBlockBottom.
+  } else if (BS.Kind != BlockType::Loop) {
+    // Scheduling pass for non-loop blocks: set fixed bundles on the
+    // pre-gathered region now that emitInterBlockTop / emitInterBlockBottom
+    // has physically inserted the SWP instructions into the block.
+    //
+    // If Regions is empty, the block was empty during GatheringRegions (e.g.
+    // a newly-created dedicated exit block). The machine scheduler skips
+    // enterRegion for empty blocks so no region was captured. Create it now
+    // with correct free-instruction boundaries, excluding any fixed bundles.
+    if (BS.getRegions().empty()) {
+      const unsigned TopCount =
+          (RegionBegin == BB->begin()) ? BS.TopInsert.size() : 0u;
+      const unsigned BotCount =
+          (RegionEnd == BB->end()) ? BS.BottomInsert.size() : 0u;
+      BS.addRegion(BB, std::next(RegionBegin, TopCount),
+                   std::prev(RegionEnd, BotCount));
+    }
     if (RegionBegin == BB->begin() && !BS.TopInsert.empty())
       BS.getCurrentRegion().setTopFixedBundles(BS.TopInsert);
     if (RegionEnd == BB->end() && !BS.BottomInsert.empty())
@@ -1197,10 +1223,9 @@ void Region::setTopFixedBundles(ArrayRef<MachineBundle> Bundles) {
     return getBundleStart(MI->getIterator()) == std::prev(FreeBegin);
   }));
   TopFixedBundles = Bundles;
-  // Remove the fixed instructions from the front of SemanticOrder so that
-  // getFreeInstructions() returns only the truly free instructions.
-  SemanticOrder.erase(SemanticOrder.begin(),
-                      SemanticOrder.begin() + Bundles.size());
+  // SemanticOrder was captured during GatheringRegions before the fixed
+  // bundles were inserted into the block, so it already contains only the
+  // free instructions. No adjustment is needed.
 }
 
 void Region::setBotFixedBundles(ArrayRef<MachineBundle> Bundles) {
@@ -1211,10 +1236,9 @@ void Region::setBotFixedBundles(ArrayRef<MachineBundle> Bundles) {
     return getBundleStart(MI->getIterator()) == FreeEnd;
   }));
   BotFixedBundles = Bundles;
-  // Remove the fixed instructions from the back of SemanticOrder so that
-  // getFreeInstructions() returns only the truly free instructions.
-  SemanticOrder.erase(SemanticOrder.end() - Bundles.size(),
-                      SemanticOrder.end());
+  // SemanticOrder was captured during GatheringRegions before the fixed
+  // bundles were inserted into the block, so it already contains only the
+  // free instructions. No adjustment is needed.
 }
 
 BlockState::BlockState(MachineBasicBlock *Block) : TheBlock(Block) {
@@ -1277,7 +1301,6 @@ void BlockState::classify() {
   if (LoopAware && IsLoop(TheBlock) &&
       llvm::all_of(TheBlock->successors(), CanFixLoopSchedule)) {
     Kind = BlockType::Loop;
-    FixPoint.Stage = SchedulingStage::GatheringRegions;
   }
 
   // We will mark the epilogues in a second sweep, when all states have been
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index 32682718021b..7c1aa0400e07 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -113,7 +113,7 @@ enum class SchedulingStage {
 /// Parameters that drive fixpoint convergence
 class FixedpointState {
 public:
-  SchedulingStage Stage = SchedulingStage::Scheduling;
+  SchedulingStage Stage = SchedulingStage::GatheringRegions;
   // Parameters of the loop-aware convergence
   int LatencyMargin = 0;
   SmallMapVector<MachineInstr *, int, 8> PerMILatencyMargin;
@@ -251,9 +251,8 @@ class BlockState {
   }
   void addRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator RegionBegin,
                  MachineBasicBlock::iterator RegionEnd) {
-    assert((Kind == BlockType::Loop &&
-            FixPoint.Stage == SchedulingStage::GatheringRegions) ||
-           FixPoint.Stage == SchedulingStage::Scheduling);
+    assert(FixPoint.Stage == SchedulingStage::GatheringRegions ||
+           (FixPoint.Stage == SchedulingStage::Scheduling && Regions.empty()));
     CurrentRegion = Regions.size();
     Regions.emplace_back(BB, RegionBegin, RegionEnd);
   }
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index 0aef0e14b0cd..616f8826c917 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -783,13 +783,8 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
   CurMBB = MBB;
   // We force bottom up region processing, so the first region
   // from a block is the bottom one. We reset this when leaving any
-  // region
+  // region.
   IsBottomRegion = true;
-
-  // The block may have a timed region, append its instructions.
-  auto &BS = InterBlock.getBlockState(MBB);
-  InterBlock.emitInterBlockTop(BS);
-  InterBlock.emitInterBlockBottom(BS);
 }
 
 static MachineBasicBlock::iterator

From 73adde41fc2581aaf4c0da067496d8c3f590e03b Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Tue, 19 May 2026 15:18:41 +0200
Subject: [PATCH 3/9] [AIE][InterBlock] Factor out InterBlockEdges into
 DataDependenceHelper

---
 .../Target/AIE/AIEDataDependenceHelper.cpp    | 23 +++++++-
 llvm/lib/Target/AIE/AIEDataDependenceHelper.h | 52 ++++++++++++++++++-
 .../Target/AIE/AIEInterBlockScheduling.cpp    | 23 --------
 llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 47 -----------------
 4 files changed, 73 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
index 08116d367768..ea36c7ae9f9f 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 
@@ -72,4 +72,25 @@ void DataDependenceHelper::dumpDot(raw_ostream &OS,
   OS << "}\n";
 }
 
+void InterBlockEdges::addNode(MachineInstr *MI) {
+  if (auto Index = DDG.initSUnit(*MI)) {
+    IndexMap &TheMap = Boundary ? SuccMap : PredMap;
+    TheMap.emplace(MI, *Index);
+  }
+}
+
+void InterBlockEdges::markBoundary() { Boundary = DDG.SUnits.size(); }
+
+const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const {
+  const auto Found = PredMap.find(MI);
+  if (Found == PredMap.end()) {
+    return nullptr;
+  }
+  return &DDG.SUnits.at(Found->second);
+}
+
+bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
+  return Boundary ? SU->NodeNum >= *Boundary : false;
+}
+
 } // end namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
index 94ad326ef07f..34b12e9c0a94 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -17,6 +17,8 @@
 
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include <map>
+#include <optional>
 
 namespace llvm {
 
@@ -53,6 +55,54 @@ class DataDependenceHelper : public ScheduleDAGInstrs {
   // are printed.
   void dumpDot(raw_ostream &OS, bool IncludeBoundaries) const;
 };
+
+/// This class generates all edges between nodes in two flow-adjacent regions.
+/// The nodes are added in forward flow order, marking the boundary at the
+/// appropriate point.
+class InterBlockEdges {
+  DataDependenceHelper DDG;
+  // The boundary between Pred and Succ nodes.
+  std::optional<unsigned> Boundary;
+
+  /// We can add the same instruction on both sides of the boundary.
+  /// We maintain explicit maps to retrieve the corresponding SUnit.
+  using IndexMap = std::map<MachineInstr *, unsigned>;
+  IndexMap PredMap;
+  IndexMap SuccMap;
+
+public:
+  InterBlockEdges(const MachineSchedContext &Context)
+      : DDG(Context, true, true) {}
+
+  /// Add a Node to the DAG.
+  void addNode(MachineInstr *);
+
+  /// Mark the boundary between the predecessor block and the successor block.
+  /// In normal operation, there should just be one call to this method.
+  /// Nodes added before are part of the predecessor, nodes added after are
+  /// part of the successor.
+  void markBoundary();
+
+  /// Create all the edges by interpreting read and write events of the nodes
+  /// in reverse order.
+  void buildEdges() { DDG.buildEdges(); }
+
+  /// To iterate forward across the SUnits of the underlying DDG.
+  auto begin() const { return DDG.SUnits.begin(); }
+  auto end() const { return DDG.SUnits.end(); }
+
+  /// The following two methods are used to find the cross-boundary edges,
+  /// by starting from a pre-boundary node and selecting its successor edges
+  /// that connect to a post-boundary node.
+  /// ---
+  /// Retrieve the SUnit that represents MI's instance before the
+  /// boundary, null if not found.
+  const SUnit *getPreBoundaryNode(MachineInstr *MI) const;
+
+  /// Check whether SU represents an instruction after the boundary.
+  bool isPostBoundaryNode(SUnit *SU) const;
+};
+
 } // namespace AIE
 } // namespace llvm
 
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index da818941314f..14d7712b5587 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -1177,29 +1177,6 @@ int InterBlockScheduling::getCyclesToAvoidResourceConflicts(
   return NopCounter;
 }
 
-void InterBlockEdges::addNode(MachineInstr *MI) {
-  if (auto Index = DDG.initSUnit(*MI)) {
-    IndexMap &TheMap = Boundary ? SuccMap : PredMap;
-    TheMap.emplace(MI, *Index);
-  }
-}
-
-// Mark the boundary between the predecessor block and the successor block
-void InterBlockEdges::markBoundary() { Boundary = DDG.SUnits.size(); }
-
-const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const {
-  auto Found = PredMap.find(MI);
-  if (Found == PredMap.end()) {
-    return nullptr;
-  }
-
-  return &DDG.SUnits.at(Found->second);
-}
-
-bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
-  return Boundary ? SU->NodeNum >= *Boundary : false;
-}
-
 Region::Region(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
                MachineBasicBlock::iterator End)
     : BB(BB) {
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index 7c1aa0400e07..8bb6a13f7264 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -31,53 +31,6 @@
 
 namespace llvm::AIE {
 
-/// This class generates all edges between nodes in two flow-adjacent regions
-/// The nodes are added in forward flow order, marking the boundary at the
-/// appropriate point.
-class InterBlockEdges {
-  DataDependenceHelper DDG;
-  // the boundary between Pred and Succ nodes
-  std::optional<unsigned> Boundary;
-
-  /// We can add the same instruction on both sides of the boundary.
-  /// We maintain explicit maps to retrieve the corresponding SUnit
-  using IndexMap = std::map<MachineInstr *, unsigned>;
-  IndexMap PredMap;
-  IndexMap SuccMap;
-
-public:
-  InterBlockEdges(const MachineSchedContext &Context)
-      : DDG(Context, true, true) {}
-
-  /// Add a Node to the DAG.
-  void addNode(MachineInstr *);
-
-  /// Mark the boundary between the predecessor block and the successor block.
-  /// In normal operation, there should just be one call to this method.
-  /// Nodes added before are part of the predecesor, nodes added after are
-  /// part of the successor
-  void markBoundary();
-
-  /// Create all the edges by interpreting read and write events of the nodes
-  // in reverse order.
-  void buildEdges() { DDG.buildEdges(); }
-
-  /// To iterate forward across the SUnits of the underlying DDG.
-  auto begin() const { return DDG.SUnits.begin(); }
-  auto end() const { return DDG.SUnits.end(); }
-
-  /// The following two methods are used to find the cross-boundary edges,
-  /// by starting from a pre-boundary node and select its successor edges that
-  /// connect to a post-boundary node.
-  /// ---
-  /// Retrieve the SUnit that represents MI's instance before the
-  /// boundary, null if not found.
-  const SUnit *getPreBoundaryNode(MachineInstr *MI) const;
-
-  /// Check whether SU represents an instruction after the boundary
-  bool isPostBoundaryNode(SUnit *SU) const;
-};
-
 // BlockType determines scheduling priority, direction and safety margin
 // handling.
 enum class BlockType { Regular, Loop, Epilogue };

From e07642435d19e2e496cb3a7f82c227bb0c62f43a Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Wed, 20 May 2026 09:08:50 +0200
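Subject: [PATCH 4/9] [AIE][InterBlock] Build per-successor interblock edges
 in MaxLatencyFinder (first attempt)

Build one interblock DDG per CFG successor and use it to compute a new
DDG-based effective latency. For now this value is only logged for
comparison: the returned latency reverts to the old findEarliestRef-based
computation.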
---
 llvm/lib/Target/AIE/AIEInterBlockScheduling.h |   2 +
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp   | 102 ++++++++++++++++--
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.h     |  62 ++++++++++-
 3 files changed, 153 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index 8bb6a13f7264..737195227930 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -396,6 +396,8 @@ class InterBlockScheduling {
 
   AIEAlternateDescriptors &getSelectedAltDescs() { return SelectedAltDescs; }
 
+  const MachineSchedContext *getContext() const { return Context; }
+
   std::optional<SWPEpilogueContext>
   getSWPEpilogueContext(MachineBasicBlock *MBB);
 };
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
index a8859864a1bd..793565f4ed2d 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -210,7 +210,55 @@ MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA)
       SafeToIgnoreMemDeps(Scheduler && CurBB &&
                           Scheduler->getInterBlock()
                               .getBlockState(CurBB)
-                              .isSafeToIgnoreMemDeps()) {}
+                              .isSafeToIgnoreMemDeps()) {
+  if (CurBB && Scheduler) {
+    const Region &CurRegion =
+        Scheduler->getInterBlock().getBlockState(CurBB).getCurrentRegion();
+    buildInterBlockEdges(CurRegion);
+  }
+}
+
+void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) {
+  const MachineSchedContext &C = *Scheduler->getInterBlock().getContext();
+  const InterBlockScheduling &IB = Scheduler->getInterBlock();
+
+  HasUnknownSuccessors = CurBB->succ_empty();
+
+  for (MachineBasicBlock *SuccBB : CurBB->successors()) {
+    SuccessorEdges &SE = PerSuccEdges.emplace_back(C);
+
+    // Pre-boundary: free instructions of the current region.
+    for (MachineInstr *MI : CurRegion.getFreeInstructions())
+      SE.addNode(MI);
+
+    SE.markBoundary();
+
+    // Post-boundary: always use getFreeInstructions() as the single source of
+    // node identity. An empty region list signifies an empty basic block; in
+    // that case no post-boundary nodes are added.
+    const BlockState &SBS = IB.getBlockState(SuccBB);
+    if (!SBS.getRegions().empty()) {
+      for (MachineInstr *MI : SBS.getTop().getFreeInstructions())
+        SE.addNode(MI);
+    }
+
+    SE.buildEdges();
+
+    // After the graph is built, record the scheduled cycle depth for each
+    // post-boundary instruction. This is kept separate from node insertion
+    // so that nodes are always added in getFreeInstructions() order.
+    // Instructions absent from Depths return depth 0 from getDepth(), which
+    // is the conservative value (no latency reduction) for unscheduled nodes.
+    if (SBS.isScheduled() && !SBS.getRegions().empty()) {
+      int Cycle = 0;
+      for (const MachineBundle &Bundle : SBS.getTop().Bundles) {
+        for (MachineInstr *MI : Bundle.getInstrs())
+          SE.recordDepth(MI, Cycle);
+        ++Cycle;
+      }
+    }
+  }
+}
 
 unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
   LLVM_DEBUG(dbgs() << MI << "\n");
@@ -239,16 +287,18 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
     }
     return Latency;
   }
+
   LLVM_DEBUG(dbgs() << "Earliest for: " << MI);
-  // Track the earliest use in any successor block, given the cycles in
-  // which these uses are scheduled
+
+  // Original findEarliestRef-based effective latency (the actual return value).
+  // For each successor block, find the earliest cycle in its top region where
+  // MI creates a dependency; reduce the raw latency by that many cycles.
   int Earliest = Latency;
   for (MachineBasicBlock *SuccBB : CurBB->successors()) {
-    auto &SBS = IB.getBlockState(SuccBB);
+    const auto &SBS = IB.getBlockState(SuccBB);
     assert(SBS.isScheduled());
     if (SBS.getRegions().empty()) {
-      // Blocks can be empty. getTop() will fail, and Earliest=0 is
-      // a conservative value
+      // Empty block: no instructions to find a dependency in, conservative.
       Earliest = 0;
       continue;
     }
@@ -257,11 +307,43 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
         findEarliestRef(MI, TopBundles, Earliest, AA, SafeToIgnoreMemDeps)
             .Cycle;
   }
-
   LLVM_DEBUG(dbgs() << "   Earliest=" << Earliest << "\n");
-  Latency = std::max(Latency - Earliest, 1);
-  LLVM_DEBUG(dbgs() << "EffectiveLatency=" << Latency << "\n");
-  return Latency;
+  const int OldEffectiveLatency = std::max(Latency - Earliest, 1);
+  LLVM_DEBUG(dbgs() << "   EffectiveLatency(old)=" << OldEffectiveLatency
+                    << "\n");
+
+  // New DDG-based computation — logged for comparison while investigating.
+  int NewEffectiveLatency = HasUnknownSuccessors ? Latency : 0;
+  for (SuccessorEdges &SE : PerSuccEdges) {
+    const SUnit *Pred = SE.getPreBoundaryNode(&MI);
+    if (!Pred)
+      continue;
+
+    for (const SDep &Dep : Pred->Succs) {
+      SUnit *Succ = Dep.getSUnit();
+      if (!SE.isPostBoundaryNode(Succ))
+        continue;
+
+      MachineInstr *SuccMI = Succ->getInstr();
+      if (!SuccMI) {
+        // ExitSU: use the full edge latency conservatively.
+        NewEffectiveLatency =
+            std::max(NewEffectiveLatency, Dep.getSignedLatency());
+        continue;
+      }
+
+      // Remaining latency = edge latency minus cycles already elapsed in
+      // the successor before SuccMI executes. getDepth returns 0 for
+      // unscheduled instructions, giving the conservative full edge latency.
+      const int Remaining = Dep.getSignedLatency() - SE.getDepth(SuccMI);
+      LLVM_DEBUG(dbgs() << "   Remaining=" << Remaining << " for " << *SuccMI);
+      NewEffectiveLatency = std::max(NewEffectiveLatency, Remaining);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "   EffectiveLatency(new)=" << NewEffectiveLatency
+                    << "\n");
+
+  return static_cast<unsigned>(OldEffectiveLatency);
 }
 
 } // namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
index e5e17d40452b..4304b4d64a87 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
@@ -16,8 +16,12 @@
 #define LLVM_LIB_TARGET_AIE_MAXLATENCYFINDER_H
 
 #include "AIEBaseSubtarget.h"
+#include "AIEDataDependenceHelper.h"
 #include "AIEMachineScheduler.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include <map>
+#include <memory>
+#include <vector>
 
 using namespace llvm;
 
@@ -41,6 +45,46 @@ InstrAndCycle findEarliestRef(const MachineInstr &SrcMI,
                               AAResults *AA = nullptr,
                               bool SafeToIgnoreMemDeps = false);
 
+/// Interblock dependence information for one specific CFG successor.
+/// Contains a DDG built from the current block's bottom region (pre-boundary)
+/// and the successor's top region (post-boundary). When the successor has been
+/// scheduled, Depths records the cycle of each post-boundary instruction;
+/// unscheduled instructions are absent from Depths and treated as depth 0
+/// (conservative — no latency reduction).
+class SuccessorEdges {
+  /// Heap-allocated so that SuccessorEdges is moveable; InterBlockEdges
+  /// cannot be safely moved due to internal SUnit pointers.
+  std::unique_ptr<InterBlockEdges> Edges;
+  /// Maps each scheduled post-boundary MachineInstr* to its cycle
+  /// (0-indexed from the top of the successor block).
+  std::map<MachineInstr *, int> Depths;
+
+public:
+  explicit SuccessorEdges(const MachineSchedContext &C)
+      : Edges(std::make_unique<InterBlockEdges>(C)) {}
+
+  // Building interface — called during buildInterBlockEdges().
+  void addNode(MachineInstr *MI) { Edges->addNode(MI); }
+  void markBoundary() { Edges->markBoundary(); }
+  void recordDepth(MachineInstr *MI, int Cycle) { Depths[MI] = Cycle; }
+  void buildEdges() { Edges->buildEdges(); }
+
+  // Query interface — called from MaxLatencyFinder::operator().
+  const SUnit *getPreBoundaryNode(MachineInstr *MI) const {
+    return Edges->getPreBoundaryNode(MI);
+  }
+  bool isPostBoundaryNode(SUnit *SU) const {
+    return Edges->isPostBoundaryNode(SU);
+  }
+  /// Returns the scheduled cycle depth of MI. Returns 0 if MI is not found,
+  /// which is the conservative value for unscheduled instructions (no
+  /// reduction of the edge latency).
+  int getDepth(MachineInstr *MI) const {
+    const auto It = Depths.find(MI);
+    return It != Depths.end() ? It->second : 0;
+  }
+};
+
 class MaxLatencyFinder {
   const AIEPostRASchedStrategy *const Scheduler;
   const AIEBaseInstrInfo *const TII;
@@ -51,10 +95,22 @@ class MaxLatencyFinder {
   AAResults *AA;
   bool SafeToIgnoreMemDeps;
 
-  // Check whether this region connects to the successor blocks
-  //
+  /// One entry per CFG successor of CurBB. SuccessorEdges is moveable
+  /// (InterBlockEdges is heap-allocated via unique_ptr inside it), so the
+  /// vector may reallocate freely.
+  std::vector<SuccessorEdges> PerSuccEdges;
+
+  /// True when CurBB has no CFG successors (e.g. a return block), requiring
+  /// the conservative raw latency as a floor.
+  bool HasUnknownSuccessors = false;
+
+  // Check whether this region connects to the successor blocks.
   bool isBottomRegion(MachineInstr *ExitMI);
 
+  // Build one SuccessorEdges per CFG successor of CurBB and populate
+  // PerSuccEdges.
+  void buildInterBlockEdges(const Region &CurRegion);
+
 public:
   // Constructors
   MaxLatencyFinder(const AIEPostRASchedStrategy *const Scheduler,
@@ -65,7 +121,7 @@ class MaxLatencyFinder {
 
   MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA = nullptr);
 
-  // Find the maximum latency of MI taking  successors into account
+  // Find the maximum latency of MI taking successors into account.
   unsigned operator()(MachineInstr &MI);
 };
 

From 58f33025054ca8ab48f5ddfb80835ba26e666058 Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Wed, 20 May 2026 10:14:58 +0200
Subject: [PATCH 5/9] [AIE] Simpler MaxLatencyFinder constructor.

---
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp | 18 +++++++++---------
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.h   | 10 +++++-----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
index 793565f4ed2d..acfd00734b90 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -174,20 +174,20 @@ InstrAndCycle findEarliestRef(const MachineInstr &SrcMI,
   return {/*MI=*/nullptr, Cycle};
 }
 
-MaxLatencyFinder::MaxLatencyFinder(
-    const AIEPostRASchedStrategy *const Scheduler,
-    const AIEBaseInstrInfo *const TII,
-    const InstrItineraryData *const Itineraries,
-    const MCRegisterInfo *const TRI, MachineBasicBlock *const CurBB,
-    AAResults *AA)
-    : Scheduler(Scheduler), TII(TII), Itineraries(Itineraries), TRI(TRI),
-      CurBB(CurBB), InterBlock(true), AA(AA), SafeToIgnoreMemDeps(false) {}
+MaxLatencyFinder::MaxLatencyFinder(const MachineSchedContext &C,
+                                   const AIEPostRASchedStrategy *Scheduler,
+                                   MachineBasicBlock *CurBB)
+    : Scheduler(Scheduler), TII(static_cast<const AIEBaseInstrInfo *>(
+                                C.MF->getSubtarget().getInstrInfo())),
+      Itineraries(C.MF->getSubtarget().getInstrItineraryData()),
+      TRI(C.MF->getSubtarget().getRegisterInfo()), CurBB(CurBB),
+      InterBlock(true), AA(C.AA), SafeToIgnoreMemDeps(false) {}
 
 // This is called from different contexts, so we need some case analysis
 // If we have a basic block, we are in a regular MachineScheduler invocation,
 // and we will be able to retrieve its strategy,
 // Otherwise we are an abstract region; Scheduler will be nullptr, which
-// will not be derefenced.
+// will not be dereferenced.
 MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA)
     : Scheduler(DAG->getBB()
                     ? static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl()
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
index 4304b4d64a87..384174373e6b 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
@@ -113,11 +113,11 @@ class MaxLatencyFinder {
 
 public:
   // Constructors
-  MaxLatencyFinder(const AIEPostRASchedStrategy *const Scheduler,
-                   const AIEBaseInstrInfo *const TII,
-                   const InstrItineraryData *const Itineraries,
-                   const MCRegisterInfo *const TRI,
-                   MachineBasicBlock *const CurBB, AAResults *AA = nullptr);
+  // Derive TII, TRI, Itineraries, and AA from the scheduling context, keeping
+  // only Scheduler and CurBB as explicit parameters.
+  MaxLatencyFinder(const MachineSchedContext &C,
+                   const AIEPostRASchedStrategy *Scheduler,
+                   MachineBasicBlock *CurBB);
 
   MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA = nullptr);
 

From 3fac8ada14381ce5df0d96f9916cfa54c3399c21 Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Wed, 20 May 2026 16:43:38 +0200
Subject: [PATCH 6/9] [AIE] SchedStrategy.buildGraph also calls the appropriate
 DAG mutators

This prepares for less rigid DAG mutators that can be localized
and given the function/interblock context in a less indirect way.
---
 llvm/lib/Target/AIE/AIEBaseSubtarget.cpp     | 20 +++++++++++++++++---
 llvm/lib/Target/AIE/AIEBaseSubtarget.h       | 20 ++++++++++++++++++--
 llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp | 12 +++++-------
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp  |  9 +++++++++
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
index bf08fdbf2ff4..a5c95241b2b1 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -912,16 +912,30 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT, AAResults *AA) {
   if (!TT.isAIE1()) {
     if (EnableWAWStickyRegisters)
       Mutations.emplace_back(std::make_unique<WAWStickyRegistersEdges>());
-    Mutations.emplace_back(std::make_unique<RegionEndEdges>(AA));
+    // RegionEndEdges must run before MemoryEdges/WAWEdges/BiasDepth, and
+    // EmitFixedSUnits must run last. The whole list is applied in order by
+    // AIEPostRASchedStrategy::buildGraph; no mutations are registered on the
+    // DAG, so postProcessDAG() in ScheduleDAGMI::schedule() is a no-op.
+    Mutations.emplace_back(createRegionEndEdgesMutation(AA));
     Mutations.emplace_back(std::make_unique<MemoryEdges>(true));
     Mutations.emplace_back(std::make_unique<MachineSchedWAWEdges>());
     Mutations.emplace_back(std::make_unique<BiasDepth>());
-    Mutations.emplace_back(std::make_unique<EmitFixedSUnits>(
-        EnableAAInEmitFixedSUnits ? AA : nullptr));
+    Mutations.emplace_back(createEmitFixedSUnitsMutation(AA));
   }
   return Mutations;
 }
 
+std::unique_ptr<ScheduleDAGMutation>
+AIEBaseSubtarget::createRegionEndEdgesMutation(AAResults *AA) {
+  return std::make_unique<RegionEndEdges>(AA);
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+AIEBaseSubtarget::createEmitFixedSUnitsMutation(AAResults *AA) {
+  return std::make_unique<EmitFixedSUnits>(EnableAAInEmitFixedSUnits ? AA
+                                                                     : nullptr);
+}
+
 // List the Mutations that apply to the interblock DAG construction.
 std::vector<std::unique_ptr<ScheduleDAGMutation>>
 AIEBaseSubtarget::getDDGMutationsImpl(const Triple &TT, bool ExactLatencies) {
diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h
index 9eaa4b29d920..9c0302b4b7d3 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h
@@ -56,8 +56,12 @@ class AIEBaseSubtarget : public TargetSubtargetInfo {
   }
   void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
                               &Mutations) const override {
-    Mutations =
-        AIEBaseSubtarget::getPostRAMutationsImpl(getTargetTriple(), nullptr);
+    // Post-RA mutations are applied directly in
+    // AIEPostRASchedStrategy::buildGraph, which owns the full graph
+    // construction pipeline. The registered Mutations list is intentionally
+    // empty so that the postProcessDAG() call in ScheduleDAGMI::schedule()
+    // is a no-op.
+    Mutations.clear();
   }
 
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
@@ -84,6 +88,18 @@ class AIEBaseSubtarget : public TargetSubtargetInfo {
   static std::vector<std::unique_ptr<ScheduleDAGMutation>>
   getSMSMutationsImpl(const Triple &TT);
 
+  /// Create the RegionEndEdges mutation. getPostRAMutationsImpl places it
+  /// before MemoryEdges, MachineSchedWAWEdges and BiasDepth, and before the
+  /// mutation from createEmitFixedSUnitsMutation (ordering is significant).
+  static std::unique_ptr<ScheduleDAGMutation>
+  createRegionEndEdgesMutation(AAResults *AA);
+
+  /// Create the EmitFixedSUnits mutation for use in buildGraph, invoked after
+  /// createRegionEndEdgesMutation to preserve the ExitSU-edge ordering
+  /// invariant.
+  static std::unique_ptr<ScheduleDAGMutation>
+  createEmitFixedSUnitsMutation(AAResults *AA);
+
   /// Whether to enable the pre-RA MachinePipeliner. This can be disabled to let
   /// the post-RA pipeliner handle the scheduling.
   bool enableMachinePipeliner() const override;
diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
index a8cb2cd06ef3..2b1b64d08f38 100644
--- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
@@ -378,13 +378,11 @@ void AIEBasePassConfig::addPreSched2() {
 
 ScheduleDAGInstrs *
 AIEBaseTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
-  ScheduleDAGMI *DAG =
-      new AIEScheduleDAGMI(C, std::make_unique<AIEPostRASchedStrategy>(C),
-                           /* RemoveKillFlags=*/true);
-  for (auto &Mutation :
-       AIEBaseSubtarget::getPostRAMutationsImpl(getTargetTriple(), C->AA))
-    DAG->addMutation(std::move(Mutation));
-  return DAG;
+  // Post-RA mutations are applied directly in
+  // AIEPostRASchedStrategy::buildGraph, so the registered Mutations list is
+  // intentionally empty (matching the empty list from getPostRAMutations).
+  return new AIEScheduleDAGMI(C, std::make_unique<AIEPostRASchedStrategy>(C),
+                              /* RemoveKillFlags=*/true);
 }
 
 ScheduleDAGInstrs *
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index 616f8826c917..fd045039ae14 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -11,6 +11,7 @@
 #include "AIEMachineScheduler.h"
 #include "AIEBaseAliasAnalysis.h"
 #include "AIEBaseInstrInfo.h"
+#include "AIEBaseSubtarget.h"
 #include "AIEBundle.h"
 #include "AIEHazardRecognizer.h"
 #include "AIEInterBlockScheduling.h"
@@ -1732,6 +1733,14 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
   DAG.ExitSU.setInstr(Region.getExitInstr());
   DAG.makeMaps();
   DAG.buildEdges(Context->AA);
+  // Apply all post-RA mutations in the correct order. buildGraph owns the
+  // complete graph construction pipeline (raw edges + mutations); the
+  // postProcessDAG() call in ScheduleDAGMI::schedule() is a no-op because
+  // the registered Mutations list is intentionally empty (getPostRAMutations
+  // returns empty, and createPostMachineScheduler no longer registers them).
+  const Triple &TT = DAG.MF.getTarget().getTargetTriple();
+  for (auto &M : AIEBaseSubtarget::getPostRAMutationsImpl(TT, Context->AA))
+    M->apply(&DAG);
   static_cast<AIEScheduleDAGMI &>(DAG).recordDbgInstrs(Region);
 }
 

From c3b66ba24bfce07d40b8f08ba418c97310d62b50 Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Thu, 21 May 2026 14:30:43 +0200
Subject: [PATCH 7/9] [AIE][InterBlock] Deal with successor region's ExitSU

Use new, more accurate exit latency, retaining the computation of
the old one for now. The old behaviour can be restored by returning
OldEffectiveLatency rather than NewEffectiveLatency in
MaxLatencyFinder::operator()

Reference updates have been checked superficially. Nothing outrageous
stands out, but I don't give a guarantee for correctness yet.
---
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp   |  67 +++--
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.h     |   8 +
 .../CodeGen/AIE/aie2/end-to-end/Add2D-red.ll  |  40 ++-
 .../AIE/aie2/end-to-end/Conv2D-red-swp.ll     |  24 +-
 .../AIE/aie2/end-to-end/TanhTemplated-swp.ll  |  40 +--
 llvm/test/CodeGen/AIE/aie2/extract.ll         |  12 +-
 .../aie2/schedule/loopaware/Add2D-like.mir    |   5 +-
 .../AIE/aie2/schedule/swp/doloop-stage0.ll    |  15 +-
 .../AIE/aie2/schedule/swp/prepipeliner-ore.ll |   2 +-
 llvm/test/CodeGen/AIE/aie2/set.ll             |   8 +-
 .../aie2p/end-to-end/conv2d_bfp16_convert.ll  |   4 +-
 .../AIE/aie2p/end-to-end/gelu-templated.ll    |  61 +++--
 .../aie2p/end-to-end/ore-hardware-loops.ll    |   4 +-
 .../AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll |  17 +-
 llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll  |  18 +-
 .../CodeGen/AIE/aie2ps/conv2d-outer-loop.ll   |  61 ++---
 .../conv2d_int8_outerloop_pipelined.ll        |  33 ++-
 .../end-to-end/conv2d_opt_outerloop_blocks.ll | 251 +++++++++++++-----
 .../end-to-end/gemm_int8_outerloop_blocks.ll  |  76 +++---
 .../gemm_int8_outerloop_pipelined-aa.ll       |  16 +-
 .../gemm_int8_outerloop_pipelined.ll          |  16 +-
 .../AIE/hardware-loops/loop-with-call.ll      |  12 +-
 .../test/CodeGen/AIE/hardware-loops/nested.ll |  13 +-
 .../AIE/schedule/commit-block-schedule.ll     |   2 +-
 llvm/test/CodeGen/AIE/switch.ll               |   4 +-
 25 files changed, 456 insertions(+), 353 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
index acfd00734b90..9677aa989514 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -245,18 +245,21 @@ void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) {
     SE.buildEdges();
 
     // After the graph is built, record the scheduled cycle depth for each
-    // post-boundary instruction. This is kept separate from node insertion
-    // so that nodes are always added in getFreeInstructions() order.
+    // post-boundary instruction and the total length of the successor block's
+    // top region.
     // Instructions absent from Depths return depth 0 from getDepth(), which
     // is the conservative value (no latency reduction) for unscheduled nodes.
-    if (SBS.isScheduled() && !SBS.getRegions().empty()) {
-      int Cycle = 0;
-      for (const MachineBundle &Bundle : SBS.getTop().Bundles) {
-        for (MachineInstr *MI : Bundle.getInstrs())
-          SE.recordDepth(MI, Cycle);
-        ++Cycle;
+    if (!SBS.isScheduled() || SBS.getRegions().empty()) {
+      return;
+    }
+    int Cycle = 0;
+    for (const MachineBundle &Bundle : SBS.getTop().Bundles) {
+      for (MachineInstr *MI : Bundle.getInstrs()) {
+        SE.recordDepth(MI, Cycle);
       }
+      ++Cycle;
     }
+    SE.setTopRegionLength(Cycle);
   }
 }
 
@@ -314,36 +317,54 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
 
   // New DDG-based computation — logged for comparison while investigating.
   int NewEffectiveLatency = HasUnknownSuccessors ? Latency : 0;
+  LLVM_DEBUG(dbgs() << "   NewEffectiveLatency init=" << NewEffectiveLatency
+                    << (HasUnknownSuccessors ? " (HasUnknownSuccessors)"
+                                             : " (known successors)")
+                    << "\n");
   for (SuccessorEdges &SE : PerSuccEdges) {
     const SUnit *Pred = SE.getPreBoundaryNode(&MI);
-    if (!Pred)
+    if (!Pred) {
+      LLVM_DEBUG(
+          dbgs() << "   No pre-boundary node for this successor, skip\n");
       continue;
+    }
+    LLVM_DEBUG(dbgs() << "   Pre-boundary SU#" << Pred->NodeNum << " has "
+                      << Pred->Succs.size() << " successor edge(s)\n");
 
     for (const SDep &Dep : Pred->Succs) {
       SUnit *Succ = Dep.getSUnit();
-      if (!SE.isPostBoundaryNode(Succ))
-        continue;
-
-      MachineInstr *SuccMI = Succ->getInstr();
-      if (!SuccMI) {
-        // ExitSU: use the full edge latency conservatively.
-        NewEffectiveLatency =
-            std::max(NewEffectiveLatency, Dep.getSignedLatency());
+      if (!SE.isPostBoundaryNode(Succ)) {
+        LLVM_DEBUG(dbgs() << "   SU#" << Succ->NodeNum
+                          << " is not a post-boundary node, skip\n");
         continue;
       }
 
-      // Remaining latency = edge latency minus cycles already elapsed in
-      // the successor before SuccMI executes. getDepth returns 0 for
-      // unscheduled instructions, giving the conservative full edge latency.
-      const int Remaining = Dep.getSignedLatency() - SE.getDepth(SuccMI);
-      LLVM_DEBUG(dbgs() << "   Remaining=" << Remaining << " for " << *SuccMI);
+      // For ExitSU the depth is the full length of the successor block's
+      // top region (all its cycles have elapsed before reaching ExitSU).
+      // For a regular instruction node the depth is its scheduled cycle
+      // within the block.
+      const int Depth = Succ->isBoundaryNode() ? SE.getTopRegionLength()
+                                               : SE.getDepth(Succ->getInstr());
+      const int EdgeLat = Dep.getSignedLatency();
+      const int Remaining = EdgeLat - Depth;
+      LLVM_DEBUG(
+          dbgs() << "   " << (Succ->isBoundaryNode() ? "ExitSU" : "SU#")
+                 << (Succ->isBoundaryNode() ? ""
+                                            : std::to_string(Succ->NodeNum))
+                 << ": latency=" << EdgeLat << ", depth=" << Depth
+                 << ", remaining=" << Remaining
+                 << ", updating NewEffectiveLatency " << NewEffectiveLatency
+                 << " -> " << std::max(NewEffectiveLatency, Remaining) << "\n");
       NewEffectiveLatency = std::max(NewEffectiveLatency, Remaining);
     }
   }
+  // Cap at the raw maxLatency of the source instruction, matching the old
+  // computation where the result is naturally bounded by Latency.
+  NewEffectiveLatency = std::min(NewEffectiveLatency, Latency);
   LLVM_DEBUG(dbgs() << "   EffectiveLatency(new)=" << NewEffectiveLatency
                     << "\n");
 
-  return static_cast<unsigned>(OldEffectiveLatency);
+  return static_cast<unsigned>(NewEffectiveLatency);
 }
 
 } // namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
index 384174373e6b..22bcb5a4c441 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
@@ -58,6 +58,10 @@ class SuccessorEdges {
   /// Maps each scheduled post-boundary MachineInstr* to its cycle
   /// (0-indexed from the top of the successor block).
   std::map<MachineInstr *, int> Depths;
+  /// Number of bundles in the successor block's top region. Used to reduce
+  /// the ExitSU edge latency by the number of cycles already elapsed in the
+  /// successor before it exits.
+  int SuccTopRegionLength = 0;
 
 public:
   explicit SuccessorEdges(const MachineSchedContext &C)
@@ -68,6 +72,7 @@ class SuccessorEdges {
   void markBoundary() { Edges->markBoundary(); }
   void recordDepth(MachineInstr *MI, int Cycle) { Depths[MI] = Cycle; }
   void buildEdges() { Edges->buildEdges(); }
+  void setTopRegionLength(int Length) { SuccTopRegionLength = Length; }
 
   // Query interface — called from MaxLatencyFinder::operator().
   const SUnit *getPreBoundaryNode(MachineInstr *MI) const {
@@ -83,6 +88,9 @@ class SuccessorEdges {
     const auto It = Depths.find(MI);
     return It != Depths.end() ? It->second : 0;
   }
+  /// Returns the number of bundles in the scheduled successor block's top
+  /// region. Returns 0 for unscheduled successors (conservative).
+  int getTopRegionLength() const { return SuccTopRegionLength; }
 };
 
 class MaxLatencyFinder {
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
index 68ee86c98a44..713f0d7e3981 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
@@ -34,8 +34,8 @@ declare { ptr, i20, i20 } @llvm.aie2.add.3d(ptr, i20, i20, i20, i20, i20, i20, i
 define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm2_data, ptr noalias %ofm_data, ptr %.out, ptr %conv.i.i.i.out, ptr %idx.ext9.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %conv.i.i.i.i.i.out, ptr %.out6, ptr %conv.i.i.i46.out, ptr %xtraiter.out, ptr %in_ptr1.051.unr.ce.out, ptr %in_ptr2.0.in50.unr.ce.out, ptr %out_ptr.049.unr.ce.out, ptr %itr_left_cnt0.048.unr.ce.out, ptr %itr_left_cnt1.047.unr.ce.out) #3 {
 ; ASM-LABEL: add2d:
 ; ASM:       // %bb.0: // %newFuncRoot
-; ASM-NEXT:    paddb [p0], #40; lda r2, [p0, #64]; nops ; nopxm ; nopv
-; ASM-NEXT:    lda m2, [p0], #-4; nopx
+; ASM-NEXT:    lda r2, [p0, #64]; paddb [p0], #40; nopxm
+; ASM-NEXT:    lda m2, [p0], #-4
 ; ASM-NEXT:    lda m5, [p0], #8
 ; ASM-NEXT:    lda m4, [p0], #8
 ; ASM-NEXT:    lda m3, [p0], #-24
@@ -60,14 +60,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
 ; ASM-NEXT:    lda p0, [p7], #-4; st m1, [p0, #0]; add r7, r2, #-1; mov r6, #1
 ; ASM-NEXT:    lda p4, [p7], #-4; st m0, [p0, #0]; ne r6, r0, r6
 ; ASM-NEXT:    lda r13, [p7], #-4; st dj0, [p0, #0]; movx r0, #3
-; ASM-NEXT:    st dj4, [p0, #0]; ltu r7, r7, r0
-; ASM-NEXT:    st dn0, [p0, #0]; nez r1, r1
-; ASM-NEXT:    lda r9, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
-; ASM-NEXT:    lda r10, [p7], #-4; st r1, [p6, #0] // Delay Slot 5
-; ASM-NEXT:    lda r11, [p7], #-4; st r5, [p0, #0] // Delay Slot 4
-; ASM-NEXT:    lda p7, [p7, #-4]; paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
-; ASM-NEXT:    lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13; padds [p1], m2 // Delay Slot 2
-; ASM-NEXT:    mova r0, #0; paddb [p2], m3; st r8, [p0, #0] // Delay Slot 1
+; ASM-NEXT:    lda r9, [p7], #-4; st dj4, [p0, #0]; ltu r7, r7, r0
+; ASM-NEXT:    lda r10, [p7], #-4; st dn0, [p0, #0]; nez r1, r1
+; ASM-NEXT:    lda r11, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
+; ASM-NEXT:    lda p7, [p7, #-4]; st r1, [p6, #0] // Delay Slot 5
+; ASM-NEXT:    st r5, [p0, #0] // Delay Slot 4
+; ASM-NEXT:    paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
+; ASM-NEXT:    lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13 // Delay Slot 2
+; ASM-NEXT:    padda [p1], m2; paddb [p2], m3; movx r0, #0; st r8, [p0, #0] // Delay Slot 1
 ; ASM-NEXT:  // %bb.1:
 ; ASM-NEXT:    j #.LBB0_5
 ; ASM-NEXT:    nop // Delay Slot 5
@@ -76,20 +76,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
 ; ASM-NEXT:    nop // Delay Slot 2
 ; ASM-NEXT:    mova r1, #0 // Delay Slot 1
 ; ASM-NEXT:  .LBB0_2: // %entry.new
-; ASM-NEXT:    nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv
+; ASM-NEXT:    vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc0, #0
 ; ASM-NEXT:    vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc4, dc0
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; nopx
-; ASM-NEXT:    vlda.ups.s32.d8 cm0, s1, [p1], m1
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4
-; ASM-NEXT:    vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r3
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm5, s1, [p2], d0
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm3, s1, [p2], d0
-; ASM-NEXT:    nop
-; ASM-NEXT:    movxm ls, #.LBB0_3
-; ASM-NEXT:    mova r0, #-4; movxm le, #.L_LEnd0
-; ASM-NEXT:    and r0, r2, r0
-; ASM-NEXT:    mova r2, #-2; add r0, r0, #-4
-; ASM-NEXT:    lshl r0, r0, r2; mov crSRSSign, r6
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; movx r0, #-4; mov crUPSSign, r4
+; ASM-NEXT:    vlda.ups.s32.d8 cm0, s1, [p1], m1; movxm ls, #.LBB0_3
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; movxm le, #.L_LEnd0
+; ASM-NEXT:    vlda.ups.s32.d8 cm4, s1, [p1], m1; and r0, r2, r0; mov s1, r3
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; add r0, r0, #-4; mov r2, #-2
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm3, s1, [p2], d0; lshl r0, r0, r2; mov crSRSSign, r6
 ; ASM-NEXT:    add r0, r0, #1; mov s0, r5
 ; ASM-NEXT:    nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-1; nopv
 ; ASM-NEXT:  .LBB0_3: // %for.body
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
index a70e2cd07a66..c0977cea0eeb 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
@@ -216,7 +216,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
 ; ZOL-NEXT:  .LBB0_1: // %outer.loop.header
 ; ZOL-NEXT:    // =>This Loop Header: Depth=1
 ; ZOL-NEXT:    // Child Loop BB0_2 Depth 2
-; ZOL-NEXT:    vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopx
+; ZOL-NEXT:    vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopb ; nopx
 ; ZOL-NEXT:    vlda.ups.s32.s16 bml0, s0, [p2], m5
 ; ZOL-NEXT:    vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m7, p5
 ; ZOL-NEXT:    vlda.ups.s32.s16 bml1, s0, [p2], m7
@@ -229,17 +229,17 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
 ; ZOL-NEXT:    vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb wh3, [p0], m6
 ; ZOL-NEXT:    vlda.ups.s32.s16 bml7, s0, [p2], m7; vldb wl7, [p0], m6
 ; ZOL-NEXT:    vlda.ups.s32.s16 bmh5, s0, [p2, #32]; vldb.3d wh7, [p0], d0
-; ZOL-NEXT:    vlda.ups.s32.s16 bml5, s0, [p2], m5; movxm ls, #.LBB0_2
-; ZOL-NEXT:    vldb wl6, [p1], #32; movxm le, #.L_LEnd0
-; ZOL-NEXT:    vlda wh6, [p1], #32; vldb wl5, [p0], m6; mov r1, p0
-; ZOL-NEXT:    vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wh5, [p0], m6; and r0, r0, r9
-; ZOL-NEXT:    vlda wl8, [p1], #32; vldb wl3, [p0], m6; add r0, r0, #33
-; ZOL-NEXT:    vlda wh8, [p1], #32; vldb.3d wh3, [p0], d0; vshift.align x4, x4, s1, x3, r0
-; ZOL-NEXT:    vlda.ups.s32.s16 bml6, s0, [p2, #0]; vldb wl1, [p1], #32; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0
-; ZOL-NEXT:    vldb wh1, [p1], #32; add r0, r1, #33; mov r1, p0
-; ZOL-NEXT:    vldb wl10, [p1], #32; vshuffle x7, x4, x2, r2
-; ZOL-NEXT:    vldb wh10, [p1], #32; vshuffle x9, x7, x0, r8
-; ZOL-NEXT:    nopb ; nopa ; nops ; and r1, r1, r9; add.nc lc, r5, #-2; nopv
+; ZOL-NEXT:    vlda.ups.s32.s16 bml5, s0, [p2], m5
+; ZOL-NEXT:    vldb wl6, [p1], #32; movxm ls, #.LBB0_2
+; ZOL-NEXT:    vldb wh6, [p1], #32; movxm le, #.L_LEnd0
+; ZOL-NEXT:    vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wl5, [p0], m6; and r0, r0, r9; mov r1, p0
+; ZOL-NEXT:    vlda wl8, [p1], #32; vldb wh5, [p0], m6; add r0, r0, #33
+; ZOL-NEXT:    vlda wh8, [p1], #32; vldb wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0
+; ZOL-NEXT:    vlda.ups.s32.s16 bml6, s0, [p2, #0]; vldb.3d wh3, [p0], d0; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0
+; ZOL-NEXT:    vldb wl1, [p1], #32; add r0, r1, #33; mov r1, p0
+; ZOL-NEXT:    vldb wh1, [p1], #32; vshuffle x7, x4, x2, r2
+; ZOL-NEXT:    vldb wl10, [p1], #32; vshuffle x9, x7, x0, r8
+; ZOL-NEXT:    vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; add.nc lc, r5, #-2; nopv
 ; ZOL-NEXT:  .LBB0_2: // %inner.loop
 ; ZOL-NEXT:    // Parent Loop BB0_1 Depth=1
 ; ZOL-NEXT:    // => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
index 401aa4affd85..c4dae8bfdf2c 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 
 ; RUN: llc -O2 -mtriple=aie2 \
 ; RUN:    %s -o - | FileCheck %s
@@ -65,6 +65,7 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
 ; CHECK-NEXT:    nopb ; nopa ; nops ; movxm r3, #16512; nopv
 ; CHECK-NEXT:    nopa ; movxm r4, #-16256
 ; CHECK-NEXT:    movxm r5, #32767
+; CHECK-NEXT:    movxm r6, #15616
 ; CHECK-NEXT:    movxm r0, #16256
 ; CHECK-NEXT:    movxm r1, #16384
 ; CHECK-NEXT:    lda r0, [p2, #0]; movxm r2, #16128
@@ -73,37 +74,36 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
 ; CHECK-NEXT:    vbcst.16 x2, r2
 ; CHECK-NEXT:    mova r1, #0; vconv.fp32.bf16 bmh0, wl2
 ; CHECK-NEXT:    vbcst.16 x2, r1
-; CHECK-NEXT:    vldb wl3, [p0], #32; vmov wh0, wl2
-; CHECK-NEXT:    mova r1, #-5; vmov wh3, wl2
+; CHECK-NEXT:    vmov wh0, wl2
+; CHECK-NEXT:    mova r1, #-5; vldb wl3, [p0], #32; vmov wh3, wl2
 ; CHECK-NEXT:    mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
-; CHECK-NEXT:    movxm r6, #15616; vmul.f bmh2, x0, x3, r1
-; CHECK-NEXT:    movxm r7, #16000
+; CHECK-NEXT:    movxm r7, #16000; vmul.f bmh2, x0, x3, r1
 ; CHECK-NEXT:    vbcst.16 x1, r3
 ; CHECK-NEXT:    vbcst.16 x8, r4
-; CHECK-NEXT:    vbcst.16 x10, r5; vmul.f bmh3, x0, x3, r1
+; CHECK-NEXT:    vbcst.16 x10, r5
 ; CHECK-NEXT:    vbcst.16 x6, r6
-; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
-; CHECK-NEXT:    vmov wh6, wl2
+; CHECK-NEXT:    vbcst.16 x4, r7; vmul.f bmh3, x0, x3, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh2; vmov wh6, wl2
+; CHECK-NEXT:    vmov wh4, wl2
 ; CHECK-NEXT:    vmin_ge.bf16 x3, r16, x3, x1
 ; CHECK-NEXT:    or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x8
-; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh3; vband x7, x10, x3
-; CHECK-NEXT:    vmov wh7, wl2
+; CHECK-NEXT:    vband x7, x10, x3
+; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh3; vmov wh3, wl2
+; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2
 ; CHECK-NEXT:    vmin_ge.bf16 x5, r16, x5, x1
 ; CHECK-NEXT:    vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x8
-; CHECK-NEXT:    vband x7, x10, x5
-; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
-; CHECK-NEXT:    vmov wh4, wl2
-; CHECK-NEXT:    vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1
+; CHECK-NEXT:    vband x7, x10, x5; vmul.f bmh2, x6, x7, r1
+; CHECK-NEXT:    vmov wh7, wl2
+; CHECK-NEXT:    vmac.f bmh3, bmh0, x3, x4, r1
+; CHECK-NEXT:    vmul.f bmh4, x6, x7, r1
 ; CHECK-NEXT:    vmul.f bmh5, x0, x7, r1
-; CHECK-NEXT:    vmac.f bmh6, bmh0, x5, x4, r1
+; CHECK-NEXT:    vmov wh5, wl2
 ; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh2; vmul.f bmh7, x0, x7, r1
-; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4
+; CHECK-NEXT:    vmac.f bmh6, bmh0, x5, x4, r1
 ; CHECK-NEXT:    vmsc.f bmh3, bmh3, x7, x3, r1
-; CHECK-NEXT:    movxm ls, #.LBB0_1; vmsc.f bml4, bmh6, x3, x5, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4; movxm ls, #.LBB0_1
 ; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh5; movxm le, #.L_LEnd0
-; CHECK-NEXT:    add.nc lc, r2, #-2
+; CHECK-NEXT:    add.nc lc, r2, #-2; vmsc.f bml4, bmh6, x3, x5, r1
 ; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh7; vmin_ge.bf16 x3, r16, x3, x1
 ; CHECK-NEXT:    vmax_lt.bf16 x3, r16, x3, x8
 ; CHECK-NEXT:    mova r0, #28; vconv.bf16.fp32 wl7, bmh3; vmin_ge.bf16 x11, r16, x5, x1
diff --git a/llvm/test/CodeGen/AIE/aie2/extract.ll b/llvm/test/CodeGen/AIE/aie2/extract.ll
index 28e9ce6a1524..220b8e83f119 100644
--- a/llvm/test/CodeGen/AIE/aie2/extract.ll
+++ b/llvm/test/CodeGen/AIE/aie2/extract.ll
@@ -99,9 +99,9 @@ define dso_local noundef <32 x i8> @_Z30test_extract_v64uint4_256_1024Dv128_DU8_
 ; CHECK-NEXT:    jz r0, #.LBB2_6
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    vlda wl4, [sp, #-160] // 32-byte Folded Reload Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    vlda wh5, [sp, #-64] // 32-byte Folded Reload Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    vlda wl4, [sp, #-160] // 32-byte Folded Reload Delay Slot 1
 ; CHECK-NEXT:  // %bb.3: // %if.else.i
 ; CHECK-NEXT:    j #.LBB2_6
 ; CHECK-NEXT:    nop // Delay Slot 5
@@ -444,9 +444,9 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v8acc32Dv32_u7__acc32i(<16
 ; CHECK-NEXT:    jz r0, #.LBB13_6
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    vlda amhh0, [sp, #-64] // 32-byte Folded Reload Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 1
 ; CHECK-NEXT:  // %bb.3: // %if.else.i
 ; CHECK-NEXT:    j #.LBB13_6
 ; CHECK-NEXT:    nop // Delay Slot 5
@@ -664,9 +664,9 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv16_u7__acc64i(<16
 ; CHECK-NEXT:    jz r0, #.LBB20_6
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 3
+; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    vlda amhh0, [sp, #-64] // 32-byte Folded Reload Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    vlda amll0, [sp, #-160] // 32-byte Folded Reload Delay Slot 1
 ; CHECK-NEXT:  // %bb.3: // %if.else.i
 ; CHECK-NEXT:    j #.LBB20_6
 ; CHECK-NEXT:    nop // Delay Slot 5
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
index 2c48a9426854..19af57435e7d 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 
 # RUN: llc --mtriple=aie2 --run-pass=postmisched \
 # RUN:     %s -o - | FileCheck %s
@@ -35,9 +35,6 @@ body:             |
   ; CHECK-NEXT:   NOP
   ; CHECK-NEXT:   NOP
   ; CHECK-NEXT:   $s1 = MOV_mv_scl killed $r2
-  ; CHECK-NEXT:   NOP
-  ; CHECK-NEXT:   NOP
-  ; CHECK-NEXT:   NOP
   ; CHECK-NEXT:   BUNDLE implicit-def $r1, implicit-def dead $srcarry, implicit-def $s0, implicit killed $r1, implicit killed $r4 {
   ; CHECK-NEXT:     renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry
   ; CHECK-NEXT:     $s0 = MOV_mv_scl killed $r4
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
index c2b8cf49d88d..b3384ae6f241 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates
 ; RUN: llc --mtriple=aie2 -O2 --aie-pipeliner-max-guards=2 -enable-aie-zol-without-minitercount=false %s -o - | FileCheck %s
 
 ; Similar to stage0.ll, but now with a do-while. Again we expect a three
@@ -18,14 +18,13 @@
 define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 noundef %n) {
 ; CHECK-LABEL: dot:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    nopa ; movxm m0, #2044
-; CHECK-NEXT:    lda r3, [p1], m0; add r5, r1, #-1
-; CHECK-NEXT:    lda r2, [p0], m0; jz r5, #.LBB0_5
-; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    nopa ; nopb ; add r5, r1, #-1; nopm
+; CHECK-NEXT:    jz r5, #.LBB0_5
+; CHECK-NEXT:    movxm m0, #2044 // Delay Slot 5
+; CHECK-NEXT:    lda r2, [p0], m0 // Delay Slot 4
+; CHECK-NEXT:    lda r3, [p1], m0 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
-; CHECK-NEXT:    movx r0, #0 // Delay Slot 1
+; CHECK-NEXT:    mova r0, #0 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %do.body
 ; CHECK-NEXT:    lda r4, [p1], m0; add r5, r5, #-1
 ; CHECK-NEXT:    lda r1, [p0], m0; jz r5, #.LBB0_4
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll
index 2eb14f4b2d98..fa1c257540d1 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/prepipeliner-ore.ll
@@ -55,7 +55,7 @@ define dso_local i32 @dot(ptr addrspace(6) nocapture readonly %a, ptr addrspace(
 ; PRE-NEXT:   - NS:              '3'
 ; PRE-NEXT:   - Loop:            bb.2.for.body
 ; PRE-NEXT:   - Prologue:        bb.1.for.body.preheader
-; PRE-NEXT:   - PrologueBundles: '10'
+; PRE-NEXT:   - PrologueBundles: '7'
 ; PRE-NEXT:   - Epilogue:        bb.3
 ; PRE-NEXT:   - EpilogueBundles: '6'
 ; PRE-NEXT: ...
diff --git a/llvm/test/CodeGen/AIE/aie2/set.ll b/llvm/test/CodeGen/AIE/aie2/set.ll
index 66c81001a1ad..294939198b68 100644
--- a/llvm/test/CodeGen/AIE/aie2/set.ll
+++ b/llvm/test/CodeGen/AIE/aie2/set.ll
@@ -16,8 +16,8 @@ define dso_local noundef <64 x i8> @_Z29test_set_v128uint_set_512_256iDv32_DU8_(
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov wl0, wl2 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov wl0, wl2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i
 ; CHECK-NEXT:    vmov wh0, wl0
 ; CHECK-NEXT:  .LBB0_2: // %_ZL13set_v128uint4iDv32_DU8_.exit
@@ -199,8 +199,8 @@ define dso_local noundef <128 x i8> @_Z27test_set_v256uint4_1024_512iDv64_DU8_(i
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov x4, x0 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov x4, x0 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i
 ; CHECK-NEXT:    vmov x5, x4
 ; CHECK-NEXT:  .LBB4_2: // %_ZL13set_v256uint4iDv64_DU8_.exit
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll
index 0dd29ac72714..ccf6cb3a1140 100644
--- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll
@@ -27,9 +27,9 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali
 ; CHECK-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; movs dc1, dj0; mov dn1, dn0
 ; CHECK-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lda m0, [p2, #4]; vldb.fill.512 [p0, lf0, r24]
+; CHECK-NEXT:    vldb.fill.512 [p0, lf0, r24]
 ; CHECK-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]
-; CHECK-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
+; CHECK-NEXT:    lda m0, [p2, #4]; vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
 ; CHECK-NEXT:    movxm le, #.L_LEnd0
 ; CHECK-NEXT:    nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; add.nc lc, r0, #-3; nopv
 ; CHECK-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; nopb ; nops ; nopx ; vconv.fp32.bf16 cml1, x6; nopv
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
index b72536458b9e..e52c94ef8500 100644
--- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll
@@ -17,54 +17,53 @@
 define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 dereferenceable(64) %params) {
 ; CHECK-LABEL: gelu_fn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; nopx
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; nopx
 ; CHECK-NEXT:    movxm r0, #16544
 ; CHECK-NEXT:    vbcst.16 x6, r0
 ; CHECK-NEXT:    lda r1, [p2, #0]; movxm r0, #17280
 ; CHECK-NEXT:    mova r0, #60; vbcst.16 x2, r0
 ; CHECK-NEXT:    vadd.f dm3, dm1, dm0, r0
-; CHECK-NEXT:    vconv.fp32.bf16 cml0, x6
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.fp32.bf16 cml0, x6
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64
 ; CHECK-NEXT:    movxm r2, #15821
-; CHECK-NEXT:    mova r2, #255; movx r4, #1; vbcst.16 x4, r2
+; CHECK-NEXT:    mova r2, #255; movx r4, #1; vbcst.16 x4, r2; vadd.f dm3, dm2, dm0, r0
 ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x8, cml3; lshl r2, r1, r4; vbcst.16 x0, r2
-; CHECK-NEXT:    mova r2, #828; mov m0, r2; vadd.f dm3, dm2, dm0, r0
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm2, x8, x2, r2
+; CHECK-NEXT:    mova r2, #828; mov m0, r2
+; CHECK-NEXT:    vmul.f dm2, x8, x2, r2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vadd.f dm3, dm1, dm0, r0
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vadd.f dm3, dm2, dm0, r0
-; CHECK-NEXT:    vconv.bf16.fp32 x10, cml3
+; CHECK-NEXT:    vconv.bf16.fp32 x10, cml3; vadd.f dm3, dm1, dm0, r0
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm1, x10, x2, r2
 ; CHECK-NEXT:    vconv.bf16.fp32 x8, cml2
-; CHECK-NEXT:    vmul.f dm1, x10, x2, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x1, cml3
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; vmul.f dm4, x8, x4, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x7, cml3; vmul.f dm2, x1, x2, r2
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vmul.f dm3, x7, x2, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x10, cml1; vadd.f dm1, dm1, dm0, r0
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.bf16.fp32 x8, cml4; movx r3, #0; vmul.f dm4, x10, x4, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x5, cml2; mov s0, r3
-; CHECK-NEXT:    vfloor.s32.bf16 x1, wl8, s0
-; CHECK-NEXT:    vconv.bf16.fp32 x5, cml3; vmul.f dm4, x5, x4, r2
-; CHECK-NEXT:    vconv.bf16.fp32 x7, cml1; movxm ls, #.LBB0_1; vadd.f dm2, dm2, dm0, r0
-; CHECK-NEXT:    mova r4, #-5; nopb ; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0; vmul.f dm3, x5, x4, r2
-; CHECK-NEXT:    mova r1, #2; vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vmul.f dm4, x7, x2, r2
-; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; vshuffle x1, x1, x3, r1
-; CHECK-NEXT:    vfloor.s32.bf16 x9, wl10, s0; vmin_ge.16 x3, r16, x1, x0, vaddsign1
-; CHECK-NEXT:    vfloor.s32.bf16 x3, wh10, s0; vbcst.16 x6, r3
-; CHECK-NEXT:    vconv.bf16.fp32 x8, cml4; vmax_lt.16 x11, r16, x3, x6, vaddsign1
-; CHECK-NEXT:    padda [p1], m0; nopb ; nops ; nopx ; add.nc lc, r4, #-7; nopv
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmul.f dm4, x8, x4, r2
+; CHECK-NEXT:    vconv.bf16.fp32 x1, cml3; vadd.f dm3, dm2, dm0, r0
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64
+; CHECK-NEXT:    vconv.bf16.fp32 x10, cml1; vmul.f dm2, x1, x2, r2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmul.f dm4, x10, x4, r2
+; CHECK-NEXT:    mova r3, #0; vconv.bf16.fp32 x8, cml4; vadd.f dm1, dm1, dm0, r0
+; CHECK-NEXT:    vconv.bf16.fp32 x7, cml3; mov s0, r3
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml2, [p0], #64; vfloor.s32.bf16 x1, wl8, s0
+; CHECK-NEXT:    vconv.bf16.fp32 x5, cml2; vmul.f dm3, x7, x2, r2
+; CHECK-NEXT:    vfloor.s32.bf16 x3, wh8, s0; movxm ls, #.LBB0_1
+; CHECK-NEXT:    mova r4, #-5; nopb ; vconv.bf16.fp32 x10, cml4; movxm le, #.L_LEnd0; vmul.f dm4, x5, x4, r2
+; CHECK-NEXT:    vconv.bf16.fp32 x7, cml1; lshl r4, r1, r4; vadd.f dm2, dm2, dm0, r0
+; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; movx r1, #2; vbcst.16 x6, r3
+; CHECK-NEXT:    vfloor.s32.bf16 x9, wl10, s0; vshuffle x1, x1, x3, r1; vmul.f dm4, x7, x2, r2
+; CHECK-NEXT:    vconv.bf16.fp32 x5, cml3; vmin_ge.16 x3, r16, x1, x0, vaddsign1
+; CHECK-NEXT:    nopa ; nopb ; vfloor.s32.bf16 x3, wh10, s0; nopx ; add.nc lc, r4, #-7; nopv
+; CHECK-NEXT:    padda [p1], m0; nopb ; vconv.bf16.fp32 x8, cml4; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; vmul.f dm3, x5, x4, r2
 ; CHECK-NEXT:  .LBB0_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    nopa ; nopb ; vconv.bf16.fp32 x10, cml2; nopxm ; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; vadd.f dm2, dm4, dm0, r0
 ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x7, cml4; nopx ; vmov cml4, cml1; vmul.f dm4, x10, x2, r2
 ; CHECK-NEXT:    nopa ; nopb ; vst x11, [p1], #64; nopx ; vshuffle x1, x9, x3, r1; nopv
-; CHECK-NEXT:    nopa ; nopb ; vfloor.s32.bf16 x3, wh8, s0; nopx ; vmin_ge.16 x5, r16, x1, x0, vaddsign1; nopv
-; CHECK-NEXT:    nopa ; nopb ; vfloor.s32.bf16 x9, wl8, s0; nopx ; vmax_lt.16 x11, r16, x5, x6, vaddsign1; nopv
+; CHECK-NEXT:    vfloor.s32.bf16 x3, wh8, s0; vmin_ge.16 x5, r16, x1, x0, vaddsign1
+; CHECK-NEXT:    vfloor.s32.bf16 x9, wl8, s0; vmax_lt.16 x11, r16, x5, x6, vaddsign1
 ; CHECK-NEXT:  .L_LEnd0:
 ; CHECK-NEXT:    nopa ; nopb ; vconv.bf16.fp32 x8, cml3; nopxm ; vmul.f dm3, x7, x4, r2
 ; CHECK-NEXT:  // %bb.2:
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll
index b7354dcb9369..6febf7745d81 100644
--- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/ore-hardware-loops.ll
@@ -68,9 +68,9 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali
 ; ASM-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; movs dc1, dj0; mov dn1, dn0
 ; ASM-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]
 ; ASM-NEXT:    nop
-; ASM-NEXT:    lda m0, [p2, #4]; vldb.fill.512 [p0, lf0, r24]
+; ASM-NEXT:    vldb.fill.512 [p0, lf0, r24]
 ; ASM-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]
-; ASM-NEXT:    vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
+; ASM-NEXT:    lda m0, [p2, #4]; vldb.pop.512.2d x4, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1
 ; ASM-NEXT:    movxm le, #.L_LEnd0
 ; ASM-NEXT:    nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; add.nc lc, r0, #-3; nopv
 ; ASM-NEXT:    vlda.pop.512 x6, [p0, lf0, r24]; nopb ; nops ; nopx ; vconv.fp32.bf16 cml1, x6; nopv
diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll
index d890556cbc69..a6f59f17636e 100644
--- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll
@@ -44,18 +44,15 @@ define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0
 ; FINE-GRAINED-NEXT:  .LBB0_1: // %for.body.i
 ; FINE-GRAINED-NEXT:    // =>This Loop Header: Depth=1
 ; FINE-GRAINED-NEXT:    // Child Loop BB0_2 Depth 2
-; FINE-GRAINED-NEXT:    nopa ; nopb ; nopx ; mov dn2, r3; movs dj2, p6
-; FINE-GRAINED-NEXT:    movs dn6, r3; mov r17, dc6
-; FINE-GRAINED-NEXT:    movs dj6, p6; mov m2, m4
-; FINE-GRAINED-NEXT:    mova p1, #0; movs dc6, r4; mov r25, r18
-; FINE-GRAINED-NEXT:    vldb.pop.576.3d ex0, [p1, lf1, r25, d2]
-; FINE-GRAINED-NEXT:    nop
+; FINE-GRAINED-NEXT:    nopa ; nopb ; movs dj2, p6; nopx ; mov dn2, r3; nopv
+; FINE-GRAINED-NEXT:    nopa ; movs dn6, r3; nopx ; mov r17, dc6
+; FINE-GRAINED-NEXT:    movs dj6, p6; or r6, r5, r5; mov r5, dj4
+; FINE-GRAINED-NEXT:    movs m2, m4; vmov lfl1, lfl0
 ; FINE-GRAINED-NEXT:    movs m1, m5; mov dn1, r3
-; FINE-GRAINED-NEXT:    movs dc1, dc0; vmov lfl1, lfl0
-; FINE-GRAINED-NEXT:    movs dj1, m5; vmov lfh1, lfh0
+; FINE-GRAINED-NEXT:    movs dc1, dc0; mov dj1, m5
 ; FINE-GRAINED-NEXT:    mova p0, #0; movs dn5, r3; mov dj5, m5
-; FINE-GRAINED-NEXT:    paddb.3d [p0], d1; or r6, r5, r5; mov r5, dj4
-; FINE-GRAINED-NEXT:    mova p0, #0; mov r21, dc5
+; FINE-GRAINED-NEXT:    mova p1, #0; paddb.3d [p0], d1; or r25, r18, r18; vmov lfh1, lfh0; movs dc6, r4
+; FINE-GRAINED-NEXT:    mova p0, #0; vldb.pop.576.3d ex0, [p1, lf1, r25, d2]; mov r21, dc5
 ; FINE-GRAINED-NEXT:  .LBB0_2: // %for.body125.i
 ; FINE-GRAINED-NEXT:    // Parent Loop BB0_1 Depth=1
 ; FINE-GRAINED-NEXT:    // => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll b/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll
index a3ea23ad71f1..cb2c9076b76d 100644
--- a/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/upd_ext_bfp16.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aie2p | FileCheck %s
 
 %struct.v64bfp16ebs16 = type <{ <64 x i8>, <8 x i8> }>
@@ -197,8 +197,8 @@ define dso_local noundef <32 x i8> @_Z20test_extract_v32int813v64bfp16ebs16i(%st
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov x0, x2 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov x0, x2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i.i
 ; CHECK-NEXT:    vmov wl0, wh0
 ; CHECK-NEXT:  .LBB10_2: // %_ZL15extract_v32int813v64bfp16ebs16i.exit
@@ -238,8 +238,8 @@ define dso_local noundef <32 x i8> @_Z20test_extract_v32int812v64bfp16ebs8i(%str
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    vmov x0, x2 // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    vmov x0, x2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.else.i.i
 ; CHECK-NEXT:    vmov wl0, wh0
 ; CHECK-NEXT:  .LBB11_2: // %_ZL15extract_v32int812v64bfp16ebs8i.exit
@@ -340,8 +340,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z11test_insert13v128bfp16ebs8i12v64bfp1
 ; CHECK-NEXT:    nopa ; jz r0, #.LBB15_2
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    mov r4, el2 // Delay Slot 3
-; CHECK-NEXT:    vmov x1, x2 // Delay Slot 2
+; CHECK-NEXT:    vmov x1, x2 // Delay Slot 3
+; CHECK-NEXT:    mov r4, el2 // Delay Slot 2
 ; CHECK-NEXT:    mov r5, eh2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.end.i
 ; CHECK-NEXT:    j #.LBB15_3
@@ -622,8 +622,8 @@ define dso_local %struct.v128bfp16ebs16 @_Z11test_insert14v128bfp16ebs16i13v64bf
 ; CHECK-NEXT:    nopa ; jz r0, #.LBB22_2
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    mov r4, el2 // Delay Slot 3
-; CHECK-NEXT:    vmov x1, x2 // Delay Slot 2
+; CHECK-NEXT:    vmov x1, x2 // Delay Slot 3
+; CHECK-NEXT:    mov r4, el2 // Delay Slot 2
 ; CHECK-NEXT:    mov r5, eh2 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %if.end.i
 ; CHECK-NEXT:    j #.LBB22_3
diff --git a/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll b/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll
index 10b3d045ad97..c42209602468 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/conv2d-outer-loop.ll
@@ -32,16 +32,16 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:    eq r18, r20, r16
 ; CHECK-NEXT:    paddxm [sp], #64; lshl r28, r24, r16
 ; CHECK-NEXT:    st p6, [sp, #-64]; ltu r27, r16, r6 // 4-byte Folded Spill
-; CHECK-NEXT:    mova dj1, #96; st r26, [p3, dj0]; or r18, r28, r18
-; CHECK-NEXT:    st.s8 r18, [p3, dj1]; add r2, r18, r2
-; CHECK-NEXT:    sel.nez r18, r2, r0, r27
+; CHECK-NEXT:    st p7, [sp, #-60]; or r18, r28, r18 // 4-byte Folded Spill
+; CHECK-NEXT:    mova dj1, #96; st r26, [p3, dj0]; add r2, r18, r2
+; CHECK-NEXT:    st.s8 r18, [p3, dj1]; sel.nez r18, r2, r0, r27
 ; CHECK-NEXT:    ne r26, r2, r16
 ; CHECK-NEXT:    jnz r26, #.LBB0_2
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    mova dj0, #76; ltu r28, r16, r18 // Delay Slot 3
-; CHECK-NEXT:    st r28, [p3, dj0]; mov r7, r8 // Delay Slot 2
-; CHECK-NEXT:    mova r2, #5; st p7, [sp, #-60]; or r17, r10, r10; mov r19, r12 // 4-byte Folded Spill Delay Slot 1
+; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    mova dj0, #76; ltu r28, r16, r18; mov r7, r8 // Delay Slot 2
+; CHECK-NEXT:    mova r2, #5; st r28, [p3, dj0]; or r17, r10, r10; mov r19, r12 // Delay Slot 1
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    jnz r24, #.LBB0_3
 ; CHECK-NEXT:    nop // Delay Slot 5
@@ -73,11 +73,9 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:    or r26, r24, r18
 ; CHECK-NEXT:  .LBB0_3: // %if.end.i
 ; CHECK-NEXT:    mova m0, #80; nopb ; nops ; nopx ; mov p2, p3; nopv
-; CHECK-NEXT:    padda [p2], m0
-; CHECK-NEXT:    st r28, [p2], #24
-; CHECK-NEXT:    st.s8 r26, [p2, #0]; ne r6, r20, r6
-; CHECK-NEXT:    jnz r6, #.LBB0_5
-; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    padda [p2], m0; ne r6, r20, r6; nopm
+; CHECK-NEXT:    st r28, [p2], #24; jnz r6, #.LBB0_5
+; CHECK-NEXT:    st.s8 r26, [p2, #0] // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
@@ -86,17 +84,15 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:    movxm r6, #16777215
 ; CHECK-NEXT:    mova dj0, #92; and r4, r4, r6
 ; CHECK-NEXT:    st r4, [p3, dj0]
-; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .LBB0_5: // %_Z24setup_conv2d_iter_paramsR13conv2d_params.exit
-; CHECK-NEXT:    mova dj0, #84; nopb ; nopxm
-; CHECK-NEXT:    lda r20, [p3, dj0]; extend.u8 r4, r26
-; CHECK-NEXT:    mova dj0, #120; eq r6, r4, r22
-; CHECK-NEXT:    lda r24, [p3, dj0]; jnz r6, #.LBB0_7
+; CHECK-NEXT:    nopa ; nopb ; nops ; extend.u8 r4, r26; nopm ; nopv
+; CHECK-NEXT:    eq r6, r4, r22
+; CHECK-NEXT:    jnz r6, #.LBB0_7
 ; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    nop // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; CHECK-NEXT:    mova dj0, #84 // Delay Slot 4
+; CHECK-NEXT:    lda r20, [p3, dj0] // Delay Slot 3
+; CHECK-NEXT:    mova dj0, #120 // Delay Slot 2
+; CHECK-NEXT:    lda r24, [p3, dj0] // Delay Slot 1
 ; CHECK-NEXT:  // %bb.6: // %_Z24setup_conv2d_iter_paramsR13conv2d_params.exit
 ; CHECK-NEXT:    ne r4, r4, r16
 ; CHECK-NEXT:    jnz r4, #.LBB0_11
@@ -142,22 +138,21 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:  .LBB0_8: // %for.body.i68
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
-; CHECK-NEXT:    nopa ; vldb x1, [p1, #64]; nopx ; mov r0, dc6; nops
+; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; mov r0, dc6; nopv
 ; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p4, #0]; vldb.popx x4, [p0, lf0, r24]; lshl r0, r0, r2; mov dc4, dc3
 ; CHECK-NEXT:    vlda.pop.3d x6, [p0, lf0, r24, d0]; or r20, r0, r16; mov dj3, r0
 ; CHECK-NEXT:    vldb.128 wl2, [p2, dj3]; mov dj3, r20
 ; CHECK-NEXT:    vldb.128 wl8, [p2, dj3]
 ; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p6, #0]; vldb x10, [p1, #0]
-; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p4, #64]
-; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p6, #64]; vldb.popx x10, [p0, lf0, r24]
-; CHECK-NEXT:    vldb.pop.3d x8, [p0, lf0, r24, d0]
-; CHECK-NEXT:    vldb.popx x10, [p0, lf0, r24]; mov p7, p1
-; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; vshuffle x2, x4, x6, r6; vmul dm2, x0, x2, r10
-; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; add.nc lc, r18, #-6; padds [p7], #128; vmul dm3, x0, x8, r10
-; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; movxm ls, #.LBB0_9; vaddmac dm1, dm1, dm2, x2, x10, r12
-; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x2, x1, r12
+; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p4, #64]; vldb.popx x10, [p0, lf0, r24]
+; CHECK-NEXT:    vlda x1, [p1, #64]; vldb.pop.3d x8, [p0, lf0, r24, d0]
+; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p6, #64]; vldb.popx x10, [p0, lf0, r24]; mov p7, p1
+; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; add.nc lc, r18, #-6; vshuffle x2, x4, x6, r6; vmul dm2, x0, x2, r10
+; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; movxm ls, #.LBB0_9; vmul dm3, x0, x8, r10
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x2, x10, r12
 ; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopxm ; nopv
-; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; nopv
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; vaddmac dm0, dm0, dm3, x2, x1, r12
 ; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopx ; vshuffle x2, x10, x8, r6; nopv
 ; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p0, lf0, r24]; padds [p7], #128; nopxm ; nopv
 ; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p0, lf0, r24, d0]; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
@@ -170,9 +165,9 @@ define void @conv2d_outer_loop(ptr noalias %ifm, ptr noalias %wts, ptr noalias %
 ; CHECK-NEXT:  // %bb.10: // %for.cond.cleanup54.i89
 ; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    vlda x4, [p7, #192]; paddb [p1], m3; padds [p7], #128; add r4, r4, #-1; nopm ; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    vlda x6, [p7, #128]; paddb [p4], #128; padds [p6], #128; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
-; CHECK-NEXT:    vlda x4, [p7, #192]; paddb.3d [p0], d1; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    nopa ; paddb.3d [p1], d2; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vlda x6, [p7, #128]; paddb.3d [p1], d2; padds [p4], #128; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vlda x4, [p7, #192]; paddb [p6], #128; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
+; CHECK-NEXT:    nopa ; paddb.3d [p0], d1; nops ; nopx ; vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
 ; CHECK-NEXT:    vmac dm0, dm0, x2, x4, r8
 ; CHECK-NEXT:    vshuffle x2, x10, x8, r6; vmac dm1, dm1, x2, x6, r8
 ; CHECK-NEXT:    vmac dm0, dm0, x2, x4, r8
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll
index 3ba9874aebe2..f4015e67392c 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_int8_outerloop_pipelined.ll
@@ -43,8 +43,8 @@ declare i1 @llvm.loop.decrement.i32(i32) #3
 define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr %ifm, i32 %cond88.i, i20 %idx.ext.i.i, i20 %idx.ext.i330.i, i20 %idx.ext.i334.i, i32 %1, i20 %idx.ext.i338.i, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %8, i20 %9, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i32 %conv197.i, i32 %conv.i.i.i.i.i, i20 %idx.ext.i342.i, i20 %idx.ext.i344.i, i20 %17, i20 %18, i20 %19, i32 %or.i.i, i32 %cond15.i.i.i.i.i, i20 %20, i20 %21, i20 %22, i20 %23, i20 %24, i32 %or22.i.i.i.i.i) #4 personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: conv2d:
 ; CHECK:       // %bb.0: // %newFuncRoot
-; CHECK-NEXT:    paddxm [sp], #64; nopb ; nops ; nopx ; mov m4, p4; nopv
-; CHECK-NEXT:    mova m0, #-68; st p6, [sp, #-64]; nopx // 4-byte Folded Spill
+; CHECK-NEXT:    paddxm [sp], #64; nopb ; nopx ; mov m4, p4
+; CHECK-NEXT:    mova m0, #-68; st p6, [sp, #-64] // 4-byte Folded Spill
 ; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p1], m4; movs m2, p5; mov p6, sp
 ; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p1], m2; paddb [p6], m0
 ; CHECK-NEXT:    lda m5, [p6], #-4
@@ -53,8 +53,9 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr
 ; CHECK-NEXT:    lda r21, [p6], #-4; mov s0, r1
 ; CHECK-NEXT:    lda r29, [p6], #-4; movx crupsmode, #0
 ; CHECK-NEXT:    lda r25, [p6], #-4
-; CHECK-NEXT:    lda dn0, [p6], #-4
-; CHECK-NEXT:    lda r27, [p6], #-4
+; CHECK-NEXT:    lda dn0, [p6], #-4; paddb [p1], m4
+; CHECK-NEXT:    lda r27, [p6], #-4; paddb [p1], m5
+; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p1, #0]
 ; CHECK-NEXT:    lda m1, [p6], #-4
 ; CHECK-NEXT:    lda r31, [p6], #-4
 ; CHECK-NEXT:    lda r16, [p6], #-4
@@ -64,20 +65,16 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %cond, ptr %cond.i, ptr
 ; CHECK-NEXT:    lda r18, [p6], #-4
 ; CHECK-NEXT:    lda dn3, [p6], #-4
 ; CHECK-NEXT:    lda dn7, [p6], #-4
-; CHECK-NEXT:    lda r20, [p6], #-4
-; CHECK-NEXT:    lda m6, [p6], #-4
-; CHECK-NEXT:    lda r1, [p6], #-4
-; CHECK-NEXT:    lda r26, [p6], #-4
-; CHECK-NEXT:    lda r22, [p6], #-4; mov dj4, #0
-; CHECK-NEXT:    lda m2, [p6], #-4; mov s1, r3
-; CHECK-NEXT:    lda dj2, [p6], #-4; or r28, r8, r8; mov dj3, #0
-; CHECK-NEXT:    lda dj6, [p6], #-4; movs dc2, dj4; or r30, r5, r5; mov r5, dj4
-; CHECK-NEXT:    lda dn2, [p6, #0]; movs dc6, dj4; or r8, r7, r7; mov r7, dj4
-; CHECK-NEXT:    lda dn6, [p6, #-4]; movs dc0, dj4; mov dj1, r31
-; CHECK-NEXT:    padda [p1], m4; movs dc1, dj4; mov dj5, r16
-; CHECK-NEXT:    padda [p1], m5; movs dc5, dj4; mov dj7, r18
-; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p1, #0]; movs dc3, dj4; mov r23, m5
-; CHECK-NEXT:    padda [p1], m4; movs dc7, dj4; add r0, r0, #-1; mov p6, p0
+; CHECK-NEXT:    lda r20, [p6], #-4; mov dj4, #0
+; CHECK-NEXT:    lda m6, [p6], #-4; mov s1, r3
+; CHECK-NEXT:    lda r1, [p6], #-4; mov dj3, #0
+; CHECK-NEXT:    lda r26, [p6], #-4; movs dc2, dj4; or r28, r8, r8; mov dc6, dj4
+; CHECK-NEXT:    lda r22, [p6], #-4; movs dc0, dj4; or r30, r5, r5; mov r5, dj4
+; CHECK-NEXT:    lda m2, [p6], #-4; movs dc1, dj4; or r8, r7, r7; mov r7, dj4
+; CHECK-NEXT:    lda dj2, [p6], #-4; movs dc5, dj4; mov r23, m5
+; CHECK-NEXT:    lda dj6, [p6], #-4; movs dc3, dj4; mov dj1, r31
+; CHECK-NEXT:    lda dn2, [p6, #0]; movs dc7, dj4; mov dj5, r16
+; CHECK-NEXT:    lda dn6, [p6, #-4]; paddb [p1], m4; add r0, r0, #-1; mov p6, p0; movs dj7, r18
 ; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p1, #0]; movs p0, p3; movx crsrsmode, #0; mov m4, r1
 ; CHECK-NEXT:  .LBB0_1: // %for.body.i
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll
index a4a5c9b46a78..fba50a7537bb 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/conv2d_opt_outerloop_blocks.ll
@@ -51,7 +51,7 @@ declare i1 @llvm.loop.decrement.i32(i32) #3
 define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm, ptr %psum_1_tdm, ptr %ifm, ptr %add.ptr.i, <64 x i8> %1, i32 %conv10.i, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i32 %7, i32 %or25.i.i.i, i32 %8, i20 %9, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i32 %conv91.i, i32 %20, i20 %idx.ext.i216.i, i20 %21, i20 %22, i20 %23, i32 %or22.i.i.i) #4 {
 ; CHECK-LABEL: conv2d:
 ; CHECK:       // %bb.0: // %newFuncRoot
-; CHECK-NEXT:    paddxm [sp], #64
+; CHECK-NEXT:    paddxm [sp], #64; nopb ; nopx
 ; CHECK-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; CHECK-NEXT:    mova m0, #-68; mov p6, sp
 ; CHECK-NEXT:    padda [p6], m0
@@ -77,15 +77,14 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; CHECK-NEXT:    lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4
 ; CHECK-NEXT:    movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7
 ; CHECK-NEXT:    vldb.pop.3d x1, [p1, lf1, r25, d0]
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vldb.128 wl2, [p5, #0]; or r22, r12, r12; mov r19, r8
-; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r21, r10, r10; mov s0, r1
-; CHECK-NEXT:    mova r16, #5; vldb x8, [p0, #0]; or r10, r3, r3; mov s1, r5
-; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; add r0, r0, #-1; mov dc6, dc7; movs dc3, dc7
-; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc2, dc7; or r8, r7, r7; addm.nc r1, r0, #-1
-; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, #16]; movxm p4, #.LBB0_1; movs dc5, dc7
-; CHECK-NEXT:    mova r18, #16; movs dc1, dc7; movx crupsmode, #0; vshuffle x10, x10, x1, r2
-; CHECK-NEXT:    mova r12, #264; st p7, [sp, #-60]; movx crsrsmode, #0; mov m5, r17 // 4-byte Folded Spill
+; CHECK-NEXT:    or r22, r12, r12; mov r19, r8
+; CHECK-NEXT:    mova r16, #5; or r21, r10, r10; mov s0, r1
+; CHECK-NEXT:    mova r18, #16; vldb.128 wl4, [p5, #16]; or r10, r3, r3; mov s1, r5
+; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc3, dc7; or r8, r7, r7; mov dc6, dc7
+; CHECK-NEXT:    mova r12, #264; vldb.128 wl2, [p5, #0]; add r0, r0, #-1; mov dc5, dc7; movs dc2, dc7
+; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; movx crupsmode, #0; addm.nc r1, r0, #-1; movs dc1, dc7
+; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; st p7, [sp, #-60]; movxm p4, #.LBB0_1 // 4-byte Folded Spill
+; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17
 ; CHECK-NEXT:  .LBB0_1: // %for.body.i
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_2 Depth 2
@@ -110,10 +109,10 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup54.i
 ; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    vlda x4, [p7, #192]; paddb.3d [p1], d1; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    vlda x6, [p7, #128]; vldb.popx x10, [p1, lf1, r25]; movs dc4, dc7; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; CHECK-NEXT:    vlda x4, [p7, #192]; paddb [p0], m4; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
-; CHECK-NEXT:    padds.3d [p0], d2; vldb.pop.3d x1, [p1, lf1, r25, d0]; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; CHECK-NEXT:    vlda x8, [p0, #0]; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
+; CHECK-NEXT:    vlda x6, [p7, #128]; paddb [p0], m4; movs dc4, dc7; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; CHECK-NEXT:    vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; CHECK-NEXT:    vldb x8, [p0, #0]; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
 ; CHECK-NEXT:    vldb x6, [p0, #64]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; CHECK-NEXT:    or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
 ; CHECK-NEXT:    movs dj7, r20; vldb.128 wl2, [p5, dj7]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
@@ -130,11 +129,11 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; CHECK-NEXT:    vldb.popx x8, [p1, lf1, r25]
 ; CHECK-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
 ; CHECK-NEXT:    vldb.popx x8, [p1, lf1, r25]
-; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12
-; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm3, x0, x4, r12
-; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vaddmac dm1, dm1, dm2, x10, x8, r10
-; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x10, x6, r10
-; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; nopv
+; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12
+; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12
+; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10
+; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10
 ; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
 ; CHECK-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv
 ; CHECK-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
@@ -168,7 +167,7 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ;
 ; NO-PROLOGUE-SPLIT-LABEL: conv2d:
 ; NO-PROLOGUE-SPLIT:       // %bb.0: // %newFuncRoot
-; NO-PROLOGUE-SPLIT-NEXT:    paddxm [sp], #64; nopb ; nopx
+; NO-PROLOGUE-SPLIT-NEXT:    paddxm [sp], #64; nopb ; nops ; nopxm ; nopv
 ; NO-PROLOGUE-SPLIT-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; NO-PROLOGUE-SPLIT-NEXT:    mova m0, #-68; mov p6, sp
 ; NO-PROLOGUE-SPLIT-NEXT:    padda [p6], m0
@@ -192,22 +191,18 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-PROLOGUE-SPLIT-NEXT:    lda m3, [p6], #-4
 ; NO-PROLOGUE-SPLIT-NEXT:    lda dj3, [p6, #0]; movx r30, #63; mov dc7, #0
 ; NO-PROLOGUE-SPLIT-NEXT:    lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4
-; NO-PROLOGUE-SPLIT-NEXT:    movs dc0, dc7; vldb.popx x4, [p1, lf1, r25]; mov dc4, dc7
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl2, [p5, #0]; vldb.pop.3d x6, [p1, lf1, r25, d0]
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p5, #16]
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2, #0]
-; NO-PROLOGUE-SPLIT-NEXT:    vldb x10, [p0, #0]
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2, #64]
-; NO-PROLOGUE-SPLIT-NEXT:    nop
-; NO-PROLOGUE-SPLIT-NEXT:    mov s0, r1
-; NO-PROLOGUE-SPLIT-NEXT:    mova r12, #264; or r21, r10, r10; mov r22, r12
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3, #0]; vldb x1, [p0, #64]; movx crupsmode, #0; vshuffle x2, x4, x6, r2; vmul dm2, x0, x2, r12
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3, #64]; or r10, r3, r3; mov r3, p5
-; NO-PROLOGUE-SPLIT-NEXT:    st p7, [sp, #-60]; mov s1, r5; vaddmac dm1, dm1, dm2, x2, x10, r10 // 4-byte Folded Spill
-; NO-PROLOGUE-SPLIT-NEXT:    movs p5, p2; mov p7, p0; vmul dm3, x0, x8, r12
-; NO-PROLOGUE-SPLIT-NEXT:    movs dc3, dc7; add r0, r0, #-1; mov dc6, dc7
-; NO-PROLOGUE-SPLIT-NEXT:    movs dc2, dc7; or r19, r8, r8; addm.nc r1, r0, #-1
-; NO-PROLOGUE-SPLIT-NEXT:    mova r16, #5; movs dc5, dc7; or r8, r7, r7; mov dc1, dc7
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl2, [p5, #0]; vldb.popx x4, [p1, lf1, r25]; movs dc0, dc7; mov dc4, dc7
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p5, #16]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p0, #64]
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2, #0]; mov s0, r1
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3, #0]; vldb x10, [p0, #0]; mov r21, r10
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2, #64]; or r10, r3, r3; mov r3, p5
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3, #64]; mov p5, p2
+; NO-PROLOGUE-SPLIT-NEXT:    st p7, [sp, #-60]; add r0, r0, #-1; mov s1, r5 // 4-byte Folded Spill
+; NO-PROLOGUE-SPLIT-NEXT:    mova r12, #264; movs p7, p0; or r22, r12, r12; mov dc3, dc7
+; NO-PROLOGUE-SPLIT-NEXT:    movs dc6, dc7; movx crupsmode, #0; vshuffle x2, x4, x6, r2; vmul dm2, x0, x2, r12
+; NO-PROLOGUE-SPLIT-NEXT:    movs dc2, dc7; or r19, r8, r8; addm.nc r1, r0, #-1; vmul dm3, x0, x8, r12
+; NO-PROLOGUE-SPLIT-NEXT:    mova r16, #5; nopb ; movs dc5, dc7; or r8, r7, r7; mov dc1, dc7; vaddmac dm1, dm1, dm2, x2, x10, r10
 ; NO-PROLOGUE-SPLIT-NEXT:    mova r18, #16; nopb ; movs p4, p3; movx crsrsmode, #0; mov m5, r17; vaddmac dm0, dm0, dm3, x2, x1, r10
 ; NO-PROLOGUE-SPLIT-NEXT:  .LBB0_1: // %for.body.i
 ; NO-PROLOGUE-SPLIT-NEXT:    // =>This Loop Header: Depth=1
@@ -235,24 +230,20 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x4, [p0, #192]; paddb [p7], m4; padds [p0], #128; nopx ; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x6, [p0, #128]; paddb.3d [p7], d2; padds.3d [p1], d1; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x4, [p0, #192]; nopb ; padds [p0], #128; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
-; NO-PROLOGUE-SPLIT-NEXT:    nopa ; vldb x10, [p7, #0]; movs p0, r3; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p7, #64]; vldb.popx x4, [p1, lf1, r25]; nops ; or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
+; NO-PROLOGUE-SPLIT-NEXT:    movs p0, r3; vldb x10, [p7, #0]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p7, #64]; vldb.popx x4, [p1, lf1, r25]; or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl2, [p0, dj7]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movs dj7, r20; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p0, dj7]; nopb ; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; NO-PROLOGUE-SPLIT-NEXT:    vlda.128 wl8, [p0, dj7]; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2, #128]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2, #192]; nopb ; movs p2, p5; movxm p0, #.LBB0_1; vmac dm0, dm0, x2, x4, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3, #128]; vmac dm1, dm1, x2, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3, #192]; mov p3, p4; vmac dm0, dm0, x2, x4, r8
-; NO-PROLOGUE-SPLIT-NEXT:    nop
-; NO-PROLOGUE-SPLIT-NEXT:    vmul dm2, x0, x2, r12
-; NO-PROLOGUE-SPLIT-NEXT:    vshuffle x2, x4, x6, r2; vmul dm3, x0, x8, r12
-; NO-PROLOGUE-SPLIT-NEXT:    nop
-; NO-PROLOGUE-SPLIT-NEXT:    vst.srs.4x dm1, s1, srssign0, [p6], m5; jnzd r1, r1, p0; vaddmac dm1, dm1, dm2, x2, x10, r10
-; NO-PROLOGUE-SPLIT-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vaddmac dm0, dm0, dm3, x2, x1, r10 // Delay Slot 5
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 4
+; NO-PROLOGUE-SPLIT-NEXT:    jnzd r1, r1, p0
+; NO-PROLOGUE-SPLIT-NEXT:    vmul dm2, x0, x2, r12 // Delay Slot 5
+; NO-PROLOGUE-SPLIT-NEXT:    vshuffle x2, x4, x6, r2; vmul dm3, x0, x8, r12 // Delay Slot 4
 ; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 3
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 2
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 1
+; NO-PROLOGUE-SPLIT-NEXT:    vst.srs.4x dm1, s1, srssign0, [p6], m5; vaddmac dm1, dm1, dm2, x2, x10, r10 // Delay Slot 2
+; NO-PROLOGUE-SPLIT-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vaddmac dm0, dm0, dm3, x2, x1, r10 // Delay Slot 1
 ; NO-PROLOGUE-SPLIT-NEXT:  // %bb.4: // %cooldown.entry
 ; NO-PROLOGUE-SPLIT-NEXT:    vldb.popx x8, [p1, lf1, r25]
 ; NO-PROLOGUE-SPLIT-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
@@ -295,7 +286,7 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ;
 ; NO-JNZD-LABEL: conv2d:
 ; NO-JNZD:       // %bb.0: // %newFuncRoot
-; NO-JNZD-NEXT:    paddxm [sp], #64; nopb ; nopxm ; nops
+; NO-JNZD-NEXT:    paddxm [sp], #64; nopb ; nops ; nopxm ; nopv
 ; NO-JNZD-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; NO-JNZD-NEXT:    mova m0, #-68; mov p6, sp
 ; NO-JNZD-NEXT:    padda [p6], m0
@@ -322,14 +313,13 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7
 ; NO-JNZD-NEXT:    vldb.pop.3d x1, [p1, lf1, r25, d0]
 ; NO-JNZD-NEXT:    nop
-; NO-JNZD-NEXT:    vldb.128 wl2, [p5, #0]
-; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movx r18, #5; mov r23, r12
-; NO-JNZD-NEXT:    mova r20, #1; vldb x8, [p0, #0]; or r19, r8, r8; mov s0, r1
-; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; or r21, r10, r10; mov s1, r5
-; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; or r10, r3, r3; mov dc3, dc7
-; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, #16]; or r8, r7, r7; mov dc2, dc7; movs dc6, dc7
-; NO-JNZD-NEXT:    mova r22, #16; movs dc5, dc7; movx crupsmode, #0; vshuffle x10, x10, x1, r2
-; NO-JNZD-NEXT:    mova r12, #264; movs dc1, dc7; movx crsrsmode, #0; mov m5, r17
+; NO-JNZD-NEXT:    mova r20, #1; movx r18, #5; mov r23, r12
+; NO-JNZD-NEXT:    mova r22, #16; vldb.128 wl4, [p5, #16]; or r19, r8, r8; mov s0, r1
+; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r21, r10, r10; mov s1, r5
+; NO-JNZD-NEXT:    mova r12, #264; vldb.128 wl2, [p5, #0]; or r10, r3, r3; mov dc3, dc7
+; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; or r8, r7, r7; mov dc2, dc7; movs dc6, dc7
+; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc5, dc7; movx crupsmode, #0; mov dc1, dc7
+; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17
 ; NO-JNZD-NEXT:  .LBB0_1: // %for.body.i
 ; NO-JNZD-NEXT:    // =>This Loop Header: Depth=1
 ; NO-JNZD-NEXT:    // Child Loop BB0_2 Depth 2
@@ -353,18 +343,18 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    vlda x6, [p4, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
 ; NO-JNZD-NEXT:  // %bb.3: // %for.cond.cleanup54.i
 ; NO-JNZD-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; NO-JNZD-NEXT:    vlda x4, [p4, #192]; paddb.3d [p1], d1; padds [p4], #128; add r0, r0, #-1; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda x6, [p4, #128]; paddb [p0], m4; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vlda x4, [p4, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p4], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vldb x8, [p0, #0]; mov r16, dc6; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vldb x6, [p0, #64]; lshl r16, r16, r18; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    or r24, r16, r22; mov dj7, r16; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda x4, [p4, #192]; paddb [p0], m4; padds [p4], #128; add r0, r0, #-1; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda x6, [p4, #128]; paddb.3d [p1], d1; padds.3d [p0], d2; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda x4, [p4, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p4], #128; nopx ; mov r16, dc6; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda x8, [p0, #0]; vldb.pop.3d x1, [p1, lf1, r25, d0]; nops ; lshl r16, r16, r18; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda x6, [p0, #64]; nopb ; or r24, r16, r22; mov dj7, r16; vmac dm0, dm0, x2, x4, r8
 ; NO-JNZD-NEXT:    movs dj7, r24; vldb.128 wl2, [p5, dj7]; eq r16, r0, r20; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; vmac dm1, dm1, x2, x6, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vmac dm0, dm0, x2, x4, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vshuffle x10, x10, x1, r2
+; NO-JNZD-NEXT:    vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vmac dm1, dm1, x2, x6, r8
+; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vmac dm0, dm0, x2, x4, r8
+; NO-JNZD-NEXT:    vshuffle x10, x10, x1, r2
 ; NO-JNZD-NEXT:    jz r16, #.LBB0_1
 ; NO-JNZD-NEXT:    nop // Delay Slot 5
 ; NO-JNZD-NEXT:    nop // Delay Slot 4
@@ -375,11 +365,11 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
 ; NO-JNZD-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
 ; NO-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
-; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12
-; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm3, x0, x4, r12
-; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vaddmac dm1, dm1, dm2, x10, x8, r10
-; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm0, dm0, dm3, x10, x6, r10
-; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; nopv
+; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12
+; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12
+; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10
+; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10
 ; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
 ; NO-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv
 ; NO-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
@@ -410,6 +400,121 @@ define dso_local void @conv2d(i32 %0, ptr %add.ptr3, ptr %ofm, ptr %psum_0_tdm,
 ; NO-JNZD-NEXT:    nop // Delay Slot 3
 ; NO-JNZD-NEXT:    nop // Delay Slot 2
 ; NO-JNZD-NEXT:    nop // Delay Slot 1
+; USE-JNZD-LABEL: conv2d:
+; USE-JNZD:       // %bb.0: // %newFuncRoot
+; USE-JNZD-NEXT:    paddxm [sp], #64; nopb ; nopx
+; USE-JNZD-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
+; USE-JNZD-NEXT:    mova m0, #-68; mov p6, sp
+; USE-JNZD-NEXT:    padda [p6], m0
+; USE-JNZD-NEXT:    lda m0, [p6], #-4
+; USE-JNZD-NEXT:    lda dn0, [p6], #-4
+; USE-JNZD-NEXT:    lda dj0, [p6], #-4
+; USE-JNZD-NEXT:    lda dn4, [p6], #-4
+; USE-JNZD-NEXT:    lda dj4, [p6], #-4
+; USE-JNZD-NEXT:    lda m4, [p6], #-4
+; USE-JNZD-NEXT:    lda m1, [p6], #-4
+; USE-JNZD-NEXT:    lda dj1, [p6], #-4
+; USE-JNZD-NEXT:    lda dj5, [p6], #-4
+; USE-JNZD-NEXT:    lda dn1, [p6], #-4
+; USE-JNZD-NEXT:    lda dn5, [p6], #-4
+; USE-JNZD-NEXT:    lda m2, [p6], #-4
+; USE-JNZD-NEXT:    lda dj2, [p6], #-4
+; USE-JNZD-NEXT:    lda dj6, [p6], #-4
+; USE-JNZD-NEXT:    lda dn2, [p6], #-4
+; USE-JNZD-NEXT:    lda dn6, [p6], #-4
+; USE-JNZD-NEXT:    lda r17, [p6], #-4
+; USE-JNZD-NEXT:    lda m3, [p6], #-4
+; USE-JNZD-NEXT:    lda dj3, [p6, #0]; movx r30, #63; mov dc7, #0
+; USE-JNZD-NEXT:    lda dn3, [p6, #-4]; movs p6, p1; movx r25, #0; mov p1, p4
+; USE-JNZD-NEXT:    movs dc0, dc7; vldb.popx x10, [p1, lf1, r25]; mov dc4, dc7
+; USE-JNZD-NEXT:    vldb.pop.3d x1, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    or r22, r12, r12; mov r19, r8
+; USE-JNZD-NEXT:    mova r16, #5; or r21, r10, r10; mov s0, r1
+; USE-JNZD-NEXT:    mova r18, #16; vldb.128 wl4, [p5, #16]; or r10, r3, r3; mov s1, r5
+; USE-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc3, dc7; or r8, r7, r7; mov dc6, dc7
+; USE-JNZD-NEXT:    mova r12, #264; vldb.128 wl2, [p5, #0]; add r0, r0, #-1; mov dc5, dc7; movs dc2, dc7
+; USE-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vldb x8, [p0, #0]; movx crupsmode, #0; addm.nc r1, r0, #-1; movs dc1, dc7
+; USE-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; st p7, [sp, #-60]; movxm p4, #.LBB0_1 // 4-byte Folded Spill
+; USE-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vldb x6, [p0, #64]; movx crsrsmode, #0; vshuffle x10, x10, x1, r2; movs m5, r17
+; USE-JNZD-NEXT:  .LBB0_1: // %for.body.i
+; USE-JNZD-NEXT:    // =>This Loop Header: Depth=1
+; USE-JNZD-NEXT:    // Child Loop BB0_2 Depth 2
+; USE-JNZD-NEXT:    nopa ; vldb.popx x10, [p1, lf1, r25]; nops ; nopxm ; nopv
+; USE-JNZD-NEXT:    vldb.pop.3d x8, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    vldb.popx x10, [p1, lf1, r25]; mov p7, p0
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; vmul dm2, x0, x2, r12
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p7], #128; vmul dm3, x0, x4, r12
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; movxm ls, #.LBB0_2; vaddmac dm1, dm1, dm2, x10, x8, r10
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; movxm le, #.L_LEnd1; vaddmac dm0, dm0, dm3, x10, x6, r10
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; nopv
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:  .LBB0_2: // %for.body55.i
+; USE-JNZD-NEXT:    // Parent Loop BB0_1 Depth=1
+; USE-JNZD-NEXT:    // => This Inner Loop Header: Depth=2
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopxm ; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:  .L_LEnd1:
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; vldb.pop.3d x8, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:  // %bb.3: // %for.cond.cleanup54.i
+; USE-JNZD-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; paddb.3d [p1], d1; padds [p7], #128; nopx ; mov dc4, dc7; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vlda x6, [p7, #128]; paddb [p0], m4; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    vlda x4, [p7, #192]; vldb.popx x10, [p1, lf1, r25]; padds [p7], #128; nopx ; mov srssign0, r6; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vlda.pop.3d x1, [p1, lf1, r25, d0]; paddb.3d [p0], d2; nops ; nopx ; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    nopa ; vldb x8, [p0, #0]; nops ; nopx ; mov r0, dc6; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vldb x6, [p0, #64]; lshl r0, r0, r16; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    or r20, r0, r18; mov dj7, r0; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    movs dj7, r20; vldb.128 wl2, [p5, dj7]; vshuffle x2, x10, x8, r2; vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    vldb.128 wl4, [p5, dj7]; vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    vmac dm1, dm1, x2, x6, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x2, x4, r8
+; USE-JNZD-NEXT:    jnzd r1, r1, p4
+; USE-JNZD-NEXT:    nop // Delay Slot 5
+; USE-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64 // Delay Slot 4
+; USE-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64 // Delay Slot 3
+; USE-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p3], #64; vst.srs.4x dm1, s1, srssign0, [p6], m5 // Delay Slot 2
+; USE-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p3], #64; vst.2d.srs.4x dm0, s1, srssign0, [p6], d3; movx srssign0, #0; vshuffle x10, x10, x1, r2 // Delay Slot 1
+; USE-JNZD-NEXT:  // %bb.4: // %cooldown.entry
+; USE-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
+; USE-JNZD-NEXT:    vldb.pop.3d x6, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    vldb.popx x8, [p1, lf1, r25]
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; add.nc lc, r4, #-6; padds [p0], #128; vmul dm2, x0, x2, r12
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; movxm ls, #.LBB0_5; vmul dm3, x0, x4, r12
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; movxm le, #.L_LEnd0; vaddmac dm1, dm1, dm2, x10, x8, r10
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopxm ; vaddmac dm0, dm0, dm3, x10, x6, r10
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; nopv
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; nopv
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:  .LBB0_5: // %for.body55.i.cd
+; USE-JNZD-NEXT:    // =>This Inner Loop Header: Depth=1
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; vldb.popx x8, [p1, lf1, r25]; padds [p0], #128; nopxm ; vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:  .L_LEnd0:
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; vldb.pop.3d x6, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:  // %bb.6: // %cooldown.exit
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; nopb ; padds [p0], #128; movx crsrsmode, #0; mov s0, r5; vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vlda x4, [p0, #128]; nopb ; movs dj0, r17; or r12, r22, r22; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vlda x2, [p0, #192]; nopb ; padds [p0], #128; or r10, r21, r21; mov srssign0, r6; vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    lda p7, [sp, #-60]; vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8 // 4-byte Folded Reload
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vshuffle x0, x8, x6, r2; vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    vmac dm1, dm1, x0, x4, r8
+; USE-JNZD-NEXT:    vmac dm0, dm0, x0, x2, r8
+; USE-JNZD-NEXT:    nop
+; USE-JNZD-NEXT:    nop
+; USE-JNZD-NEXT:    lda p6, [sp, #-64] // 4-byte Folded Reload
+; USE-JNZD-NEXT:    ret lr
+; USE-JNZD-NEXT:    vst.srs.4x dm1, s0, srssign0, [p6, #0] // Delay Slot 5
+; USE-JNZD-NEXT:    vst.srs.4x dm0, s0, srssign0, [p6, dj0] // Delay Slot 4
+; USE-JNZD-NEXT:    nop // Delay Slot 3
+; USE-JNZD-NEXT:    nop // Delay Slot 2
+; USE-JNZD-NEXT:    paddxm [sp], #-64; movx srssign0, #0; mov r8, r19 // Delay Slot 1
 newFuncRoot:
   br label %for.body.i
 
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll
index 125947b70f3d..dfbc9e84aaec 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_blocks.ll
@@ -45,7 +45,7 @@ declare i1 @llvm.loop.decrement.i32(i32) #2
 define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, ptr %p_psum, ptr %p_c, ptr %p_bias, i20 %3, i20 %4, i20 %5, i20 %6, i20 %7, i20 %idx.ext.i, i20 %8, i20 %9, i20 %10, i20 %11, i20 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %or23.i.i.i.i, <64 x i8> %17, i32 %18, i32 %19, i32 %20, i32 %21, i32 %22, i32 %or22.i.i.i.i, i32 %conv166, i32 %conv.i.i.i.i, i20 %idx.ext.i478, i20 %23, i20 %24, i20 %25, i20 %26, i20 %27, i20 %28) #3 {
 ; CHECK-LABEL: gemm:
 ; CHECK:       // %bb.0: // %newFuncRoot
-; CHECK-NEXT:    paddxm [sp], #64; nopx
+; CHECK-NEXT:    paddxm [sp], #64; nopb ; nopx
 ; CHECK-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; CHECK-NEXT:    mova m0, #-68; mov p6, sp
 ; CHECK-NEXT:    padda [p6], m0
@@ -66,34 +66,33 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:    lda r17, [p6], #-4
 ; CHECK-NEXT:    lda r19, [p6], #-4
 ; CHECK-NEXT:    lda m5, [p6], #-4
-; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64
-; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]
-; CHECK-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; add r0, r0, #-1; mov r21, r8
-; CHECK-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r22, #0; mov dc0, #0
-; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; or r23, r10, r10; mov s0, r22
-; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; movs dc1, dc0; or r10, r5, r5; mov dc5, dc0
-; CHECK-NEXT:    lda m2, [p6], #-4; vldb.3d x10, [p1], d1; movx crupsmode, #0; vbcst.32 x2, r22
-; CHECK-NEXT:    lda dj2, [p6], #-4; movs dc4, dc0; movx r22, #15; addm.nc r5, r0, #-1
-; CHECK-NEXT:    lda dn2, [p6], #-4; vldb x6, [p0], #64; movs m0, p5; vsel.32 x4, x2, x4, r22
-; CHECK-NEXT:    lda m3, [p6], #-4; vldb.3d x8, [p0], d0; vsel.32 x2, x2, x6, r22
-; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r1
+; CHECK-NEXT:    vldb.128 wl4, [p4, #0]
+; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; or r21, r8, r8; mov dc2, #0
+; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r22, #0; mov dc0, #0
+; CHECK-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; add r0, r0, #-1; vbcst.32 x2, r22
+; CHECK-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; or r23, r10, r10; mov s0, r22
+; CHECK-NEXT:    lda m2, [p6], #-4; movs dc1, dc0; or r10, r5, r5; mov dc5, dc0
+; CHECK-NEXT:    lda dj2, [p6], #-4; vldb.3d x10, [p1], d1; movx r22, #15; addm.nc r5, r0, #-1
+; CHECK-NEXT:    lda dn2, [p6], #-4; movx crupsmode, #0; vsel.32 x4, x2, x4, r22
+; CHECK-NEXT:    lda m3, [p6], #-4; vsel.32 x2, x2, x6, r22
+; CHECK-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r1
 ; CHECK-NEXT:    lda dj3, [p6, #0]; vshuffle x1, x6, x0, r2
 ; CHECK-NEXT:    lda dn3, [p6, #-4]; movxm p6, #.LBB0_1
-; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p2], #64; vshuffle x8, x10, x0, r3
-; CHECK-NEXT:    mova dc2, #0; or r24, r12, r12; mov s1, r17
-; CHECK-NEXT:    mova r12, #776; movs dc3, dc2; movx crsrsmode, #0; vshuffle x10, x8, x0, r4
+; CHECK-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vldb x6, [p0], #64; or r24, r12, r12; mov s1, r17; movs dc4, dc0
+; CHECK-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; movs m0, p5; movx r12, #776; vshuffle x8, x10, x0, r3
+; CHECK-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p2], #64; vldb.3d x8, [p0], d0; movx crsrsmode, #0; vshuffle x10, x8, x0, r4; movs dc3, dc2
 ; CHECK-NEXT:  .LBB0_1: // %for.body
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_2 Depth 2
-; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; nopxm ; vmul dm4, x0, x4, r12
+; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; nopxm ; nopv
 ; CHECK-NEXT:    vlda.3d x7, [p1], d1; nopb ; nopx
-; CHECK-NEXT:    vldb x5, [p0], #64; vaddmac dm3, dm3, dm4, x6, x1, r10
-; CHECK-NEXT:    vlda.3d x3, [p0], d0; vmul dm4, x0, x2, r12
-; CHECK-NEXT:    movs m0, p5; vldb x9, [p1], m4; vaddmac dm2, dm2, dm4, x8, x1, r10
-; CHECK-NEXT:    vlda.3d x7, [p1], d1; movxm ls, #.LBB0_2; vaddmac dm1, dm1, dm4, x6, x10, r10
-; CHECK-NEXT:    vldb x5, [p0], #64; movxm le, #.L_LEnd1; vaddmac dm0, dm0, dm4, x8, x10, r10
-; CHECK-NEXT:    vlda.3d x3, [p0], d0; vshuffle x1, x9, x0, r7
-; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; add.nc lc, r6, #-3; vshuffle x10, x1, x0, r16; nopv
+; CHECK-NEXT:    vldb x5, [p0], #64; vmul dm4, x0, x4, r12
+; CHECK-NEXT:    vlda.3d x3, [p0], d0
+; CHECK-NEXT:    movs m0, p5; vldb x9, [p1], m4; vaddmac dm3, dm3, dm4, x6, x1, r10
+; CHECK-NEXT:    vlda.3d x7, [p1], d1; movxm ls, #.LBB0_2; vmul dm4, x0, x2, r12
+; CHECK-NEXT:    vldb x5, [p0], #64; movxm le, #.L_LEnd1; vaddmac dm2, dm2, dm4, x8, x1, r10
+; CHECK-NEXT:    vlda.3d x3, [p0], d0; vshuffle x1, x9, x0, r7; vaddmac dm1, dm1, dm4, x6, x10, r10
+; CHECK-NEXT:    nopa ; vldb x9, [p1], m4; movs m0, p5; add.nc lc, r6, #-3; vshuffle x10, x1, x0, r16; vaddmac dm0, dm0, dm4, x8, x10, r10
 ; CHECK-NEXT:    vlda.3d x7, [p1], d1; nopb ; nops ; nopx ; vshuffle x8, x7, x0, r18; nopv
 ; CHECK-NEXT:    nopa ; vldb x5, [p0], #64; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
 ; CHECK-NEXT:    vlda.3d x3, [p0], d0; nopb ; nops ; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
@@ -236,7 +235,7 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda x1, [p1], m4; paddb.2d [p4], d7; movs p2, p7; nopx ; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    padda [p2], m6; vldb x8, [p0], #64; movs m0, p5; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    nopa ; vldb.3d x10, [p0], d0; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
-; NO-PROLOGUE-SPLIT-NEXT:    vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
+; NO-PROLOGUE-SPLIT-NEXT:    nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p7, dj2]; vldb.128 wl3, [p4, #16]; movs p7, p6; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; NO-PROLOGUE-SPLIT-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
@@ -250,11 +249,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; NO-PROLOGUE-SPLIT-NEXT:    vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22
 ; NO-PROLOGUE-SPLIT-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x1, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r10
 ; NO-PROLOGUE-SPLIT-NEXT:    movxm p2, #.LBB0_1; vaddmac dm2, dm2, dm4, x10, x6, r10
-; NO-PROLOGUE-SPLIT-NEXT:    vshuffle x6, x1, x0, r4; vmul dm4, x0, x4, r12
-; NO-PROLOGUE-SPLIT-NEXT:    jnzd r5, r5, p2
-; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r10 // Delay Slot 5
-; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm0, dm0, dm4, x10, x6, r10 // Delay Slot 4
-; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 3
+; NO-PROLOGUE-SPLIT-NEXT:    jnzd r5, r5, p2; vshuffle x6, x1, x0, r4; vmul dm4, x0, x4, r12
+; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 5
+; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r10 // Delay Slot 4
+; NO-PROLOGUE-SPLIT-NEXT:    vaddmac dm0, dm0, dm4, x10, x6, r10 // Delay Slot 3
 ; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 2
 ; NO-PROLOGUE-SPLIT-NEXT:    nop // Delay Slot 1
 ; NO-PROLOGUE-SPLIT-NEXT:  // %bb.4: // %cooldown.entry
@@ -300,7 +298,7 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ;
 ; NO-JNZD-LABEL: gemm:
 ; NO-JNZD:       // %bb.0: // %newFuncRoot
-; NO-JNZD-NEXT:    paddxm [sp], #64; nopxm
+; NO-JNZD-NEXT:    paddxm [sp], #64; nopb ; nopxm ; nops
 ; NO-JNZD-NEXT:    st p6, [sp, #-64] // 4-byte Folded Spill
 ; NO-JNZD-NEXT:    mova m0, #-68; mov p6, sp
 ; NO-JNZD-NEXT:    padda [p6], m0
@@ -322,19 +320,19 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; NO-JNZD-NEXT:    lda r19, [p6], #-4
 ; NO-JNZD-NEXT:    lda m5, [p6], #-4
 ; NO-JNZD-NEXT:    nop
-; NO-JNZD-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]; mov dc2, #0
+; NO-JNZD-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p2], #64; vldb.128 wl4, [p4, #0]
 ; NO-JNZD-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p2], #64; vldb.128 wl6, [p4, #16]; mov r21, r8
-; NO-JNZD-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; movx r24, #0; mov dc0, #0
-; NO-JNZD-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; vbcst.32 x2, r24
-; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc1, dc0; mov dc5, dc0
+; NO-JNZD-NEXT:    vlda.ups.2x cml2, s0, upssign1, [p2], #64; vldb x8, [p1], m4; mov dc2, #0
+; NO-JNZD-NEXT:    vlda.ups.2x cmh2, s0, upssign1, [p2], #64; mov dc0, #0
+; NO-JNZD-NEXT:    vlda.ups.2x cml1, s0, upssign1, [p2], #64; movs dc1, dc0; movx r24, #0; mov dc5, dc0
 ; NO-JNZD-NEXT:    vlda.ups.2x cmh1, s0, upssign1, [p2], #64; vldb.3d x1, [p1], d1; movx crupsmode, #0; mov s0, r24
-; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; vldb x6, [p0], #64; movx r24, #15; mov m0, p5; movs dc4, dc0
-; NO-JNZD-NEXT:    lda m2, [p6], #-4; vldb.3d x8, [p0], d0; vsel.32 x4, x2, x4, r24
+; NO-JNZD-NEXT:    vlda.ups.2x cml0, s0, upssign1, [p2], #64; movx r24, #15; vbcst.32 x2, r24
+; NO-JNZD-NEXT:    lda m2, [p6], #-4; vsel.32 x4, x2, x4, r24
 ; NO-JNZD-NEXT:    vlda.ups.2x cmh0, s0, upssign1, [p2], #64; or r23, r10, r10; vsel.32 x2, x2, x6, r24
 ; NO-JNZD-NEXT:    lda dj2, [p6], #-4; or r25, r12, r12; vshuffle x8, x8, x0, r1
-; NO-JNZD-NEXT:    lda dn2, [p6], #-4; movx r22, #1; vshuffle x10, x8, x0, r2
-; NO-JNZD-NEXT:    lda m3, [p6], #-4; movx r12, #776; mov s1, r17
-; NO-JNZD-NEXT:    lda dj3, [p6, #0]; or r10, r5, r5; vshuffle x1, x1, x0, r3
+; NO-JNZD-NEXT:    lda dn2, [p6], #-4; vldb x6, [p0], #64; movx r22, #1; vshuffle x10, x8, x0, r2; movs dc4, dc0
+; NO-JNZD-NEXT:    lda m3, [p6], #-4; movs m0, p5; movx r12, #776; mov s1, r17
+; NO-JNZD-NEXT:    lda dj3, [p6, #0]; vldb.3d x8, [p0], d0; or r10, r5, r5; vshuffle x1, x1, x0, r3
 ; NO-JNZD-NEXT:    lda dn3, [p6, #-4]; movs dc3, dc2; movx crsrsmode, #0; vshuffle x1, x1, x0, r4
 ; NO-JNZD-NEXT:  .LBB0_1: // %for.body
 ; NO-JNZD-NEXT:    // =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll
index 502ee192b9be..3994f2eae751 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined-aa.ll
@@ -106,9 +106,9 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup99
 ; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    padda [p2], m5; paddb.2d [p4], d7; movs m0, p5; add r0, r0, #-1; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
-; CHECK-NEXT:    nopa ; vldb x8, [p0], #64; movs p6, p2; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
-; CHECK-NEXT:    nopa ; vldb x10, [p1], m4; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
-; CHECK-NEXT:    nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
+; CHECK-NEXT:    movs p6, p2; vldb x8, [p0], #64; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
+; CHECK-NEXT:    vldb x10, [p1], m4; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
+; CHECK-NEXT:    vldb.3d x5, [p1], d1; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
 ; CHECK-NEXT:    vlda.3d x1, [p0], d0; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p6], #64; vldb.128 wl3, [p4, #16]; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p6], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
@@ -122,12 +122,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:    vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22
 ; CHECK-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x10, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r12
 ; CHECK-NEXT:    vshuffle x6, x10, x0, r4; vaddmac dm2, dm2, dm4, x1, x6, r12
-; CHECK-NEXT:    vmul dm4, x0, x4, r10
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    jnz r0, #.LBB0_1; vaddmac dm1, dm1, dm4, x8, x6, r12
-; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    jnz r0, #.LBB0_1; vmul dm4, x0, x4, r10
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r12 // Delay Slot 4
+; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 ; CHECK-NEXT:  // %bb.4: // %cooldown.entry
diff --git a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll
index c73701b5f2da..c9f84a0887b7 100644
--- a/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll
+++ b/llvm/test/CodeGen/AIE/aie2ps/end-to-end/gemm_int8_outerloop_pipelined.ll
@@ -107,9 +107,9 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup99
 ; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    padda [p2], m5; paddb.2d [p4], d7; movs m0, p5; add r0, r0, #-1; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
-; CHECK-NEXT:    nopa ; vldb x8, [p0], #64; movs p6, p2; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
-; CHECK-NEXT:    nopa ; vldb x10, [p1], m4; nops ; nopx ; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
-; CHECK-NEXT:    nopa ; vldb.3d x5, [p1], d1; nopx ; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
+; CHECK-NEXT:    movs p6, p2; vldb x8, [p0], #64; nopx ; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
+; CHECK-NEXT:    vldb x10, [p1], m4; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
+; CHECK-NEXT:    vldb.3d x5, [p1], d1; vshuffle x1, x9, x0, r7; vmac dm2, dm2, x3, x10, r8
 ; CHECK-NEXT:    vlda.3d x1, [p0], d0; vldb.128 wl6, [p4, #0]; vshuffle x10, x1, x0, r16; vmac dm1, dm1, x5, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cml3, s0, upssign1, [p6], #64; vldb.128 wl3, [p4, #16]; vshuffle x8, x7, x0, r18; vmac dm0, dm0, x3, x6, r8
 ; CHECK-NEXT:    vlda.ups.2x cmh3, s0, upssign1, [p6], #64; vshuffle x6, x8, x0, r20; vmac dm3, dm3, x5, x10, r8
@@ -123,12 +123,10 @@ define dso_local void @gemm(i32 %0, ptr addrspace(5) %1, ptr addrspace(5) %2, pt
 ; CHECK-NEXT:    vst.srs.4x dm1, s1, srssign0, [p3], #64; vsel.32 x4, x4, x3, r22
 ; CHECK-NEXT:    vst.2d.srs.4x dm0, s1, srssign0, [p3], d3; movx srssign0, #0; vshuffle x10, x5, x0, r3; vaddmac dm3, dm3, dm4, x8, x6, r12
 ; CHECK-NEXT:    vshuffle x6, x10, x0, r4; vaddmac dm2, dm2, dm4, x1, x6, r12
-; CHECK-NEXT:    vmul dm4, x0, x4, r10
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    jnz r0, #.LBB0_1; vaddmac dm1, dm1, dm4, x8, x6, r12
-; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
+; CHECK-NEXT:    jnz r0, #.LBB0_1; vmul dm4, x0, x4, r10
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vaddmac dm1, dm1, dm4, x8, x6, r12 // Delay Slot 4
+; CHECK-NEXT:    vaddmac dm0, dm0, dm4, x1, x6, r12 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 ; CHECK-NEXT:  // %bb.4: // %cooldown.entry
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll b/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll
index a31040ee5271..3b6b3ed3e574 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/loop-with-call.ll
@@ -204,8 +204,8 @@ define dso_local void @_Z5test4i(i32 noundef %n) {
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    nop // Delay Slot 3
-; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 2
-; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 1
 ; AIE2P-NEXT:  .LBB1_1: // %for.body
 ; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
 ; AIE2P-NEXT:    nopa ; nopb ; jl #_Z16addToSymbolTablePKci; nops
@@ -309,10 +309,10 @@ define dso_local void @memcpy_lowered_to_call(ptr nocapture writeonly %a, ptr no
 ; AIE2P-NEXT:    nopa ; nopb ; st r8, [sp, #-60]; ge r0, r1, r0; mov r8, r0; nopv // 4-byte Folded Spill
 ; AIE2P-NEXT:    jnz r0, #.LBB2_3
 ; AIE2P-NEXT:    nopx // Delay Slot 5
-; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 4
-; AIE2P-NEXT:    st p6, [sp, #-56] // 4-byte Folded Spill Delay Slot 3
-; AIE2P-NEXT:    st p7, [sp, #-52] // 4-byte Folded Spill Delay Slot 2
-; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 4
+; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 3
+; AIE2P-NEXT:    st p6, [sp, #-56] // 4-byte Folded Spill Delay Slot 2
+; AIE2P-NEXT:    st p7, [sp, #-52] // 4-byte Folded Spill Delay Slot 1
 ; AIE2P-NEXT:  // %bb.1:
 ; AIE2P-NEXT:    movs p6, p0; mov p7, p1
 ; AIE2P-NEXT:  .LBB2_2: // %for.body
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
index 39aa8210245f..8d4b34ef041c 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
@@ -24,17 +24,16 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ; AIE2:       // %bb.0: // %for.cond3.preheader.lr.ph
 ; AIE2-NEXT:    nopb ; mova r3, #0; nops ; nopxm ; nopv
 ; AIE2-NEXT:    mova r4, #2; nopx
-; AIE2-NEXT:    movxm p2, #.LBB0_2
 ; AIE2-NEXT:    lda r2, [p0, #0]
+; AIE2-NEXT:    movxm p2, #.LBB0_2
 ; AIE2-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2-NEXT:    nopa ; lshl r5, r3, r4; nopm
+; AIE2-NEXT:    nopa ; nopb ; lshl r5, r3, r4; nopm
 ; AIE2-NEXT:    mov dj0, r5
 ; AIE2-NEXT:    lda p3, [p1, dj0]
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
-; AIE2-NEXT:    nop
 ; AIE2-NEXT:    mova r6, #0
 ; AIE2-NEXT:    add.nc r5, r1, #-1
 ; AIE2-NEXT:  .LBB0_2: // %for.body6
@@ -73,17 +72,16 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ; AIE2P:       // %bb.0: // %for.cond3.preheader.lr.ph
 ; AIE2P-NEXT:    mova r3, #0; nopb ; nops ; nopxm ; nopv
 ; AIE2P-NEXT:    mova r4, #2; nopx
-; AIE2P-NEXT:    movxm p2, #.LBB0_2
 ; AIE2P-NEXT:    lda r2, [p0, #0]
+; AIE2P-NEXT:    movxm p2, #.LBB0_2
 ; AIE2P-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2P-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2P-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2P-NEXT:    nopa ; lshl r5, r3, r4; nopm
+; AIE2P-NEXT:    nopa ; nopb ; lshl r5, r3, r4; nopm
 ; AIE2P-NEXT:    mov dj0, r5
 ; AIE2P-NEXT:    lda p3, [p1, dj0]
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
-; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    mova r6, #0
 ; AIE2P-NEXT:    add.nc r5, r1, #-1
 ; AIE2P-NEXT:  .LBB0_2: // %for.body6
@@ -127,12 +125,11 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ; AIE2PS-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2PS-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2PS-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2PS-NEXT:    nopa ; lshl r16, r4, r6; nopm
+; AIE2PS-NEXT:    nopa ; nopb ; lshl r16, r4, r6; nopm
 ; AIE2PS-NEXT:    mov dj0, r16
 ; AIE2PS-NEXT:    lda p3, [p1, dj0]
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
-; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    addm.nc r3, r1, #-1
 ; AIE2PS-NEXT:    mova r16, #0
 ; AIE2PS-NEXT:  .LBB0_2: // %for.body6
diff --git a/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll b/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll
index cb2b638af789..4c89cd3bc47c 100644
--- a/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll
+++ b/llvm/test/CodeGen/AIE/schedule/commit-block-schedule.ll
@@ -23,8 +23,8 @@ define void @test_commit_block_schedule(i1 %0) {
 ; CHECK-NEXT:    nopa ; nopb ; nops ; movxm le, #.L_LEnd0; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; nopv
 ; CHECK-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; nopv
 ; CHECK-NEXT:  .LBB0_2: // %for.body54
 ; CHECK-NEXT:    // Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AIE/switch.ll b/llvm/test/CodeGen/AIE/switch.ll
index 01686e40e019..2ec5147ccf10 100644
--- a/llvm/test/CodeGen/AIE/switch.ll
+++ b/llvm/test/CodeGen/AIE/switch.ll
@@ -164,8 +164,8 @@ define  i32 @test(i8 signext %i) noinline nounwind optnone {
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    nop // Delay Slot 3
-; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 2
-; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    st lr, [sp, #-64] // 4-byte Folded Spill Delay Slot 1
 ; AIE2P-NEXT:  // %bb.1: // %entry
 ; AIE2P-NEXT:    movxm p0, ##.LJTI0_0
 ; AIE2P-NEXT:    movxm r1, #1048575

From 843f9612b0ba674add454fd7dce8cef61de6a86c Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Tue, 26 May 2026 15:29:38 +0200
Subject: [PATCH 8/9] [NFC][MaxLatencyFinder] Remove old maxlatency computation

o findEarliestRef is removed
o InterBlockEdges subclasses DataDependenceHelper and overrides mayAlias to
  implement SafeToIgnoreMemDeps
o AAResults enters through the scheduling context in the constructor
---
 llvm/lib/Target/AIE/AIEBaseSubtarget.cpp      |  12 +-
 llvm/lib/Target/AIE/AIEBaseSubtarget.h        |   3 +-
 .../Target/AIE/AIEDataDependenceHelper.cpp    |  17 +-
 llvm/lib/Target/AIE/AIEDataDependenceHelper.h |  26 ++-
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp   | 172 ++----------------
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.h     |  25 +--
 6 files changed, 60 insertions(+), 195 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
index a5c95241b2b1..1cbf15e7065f 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -269,8 +269,6 @@ class BiasDepth : public ScheduleDAGMutation {
 };
 
 class RegionEndEdges : public ScheduleDAGMutation {
-  AAResults *AA;
-
   void removeExitSUPreds(ScheduleDAGInstrs *DAG) {
     SUnit &ExitSU = DAG->ExitSU;
     while (!ExitSU.Preds.empty()) {
@@ -278,7 +276,7 @@ class RegionEndEdges : public ScheduleDAGMutation {
     }
   }
   void apply(ScheduleDAGInstrs *DAG) override {
-    AIE::MaxLatencyFinder MaxLatency(DAG, AA);
+    AIE::MaxLatencyFinder MaxLatency(DAG);
     MachineBasicBlock *PrologueMBB = DAG->getBB();
     unsigned int ZOLBundlesCount = 0;
 
@@ -349,7 +347,7 @@ class RegionEndEdges : public ScheduleDAGMutation {
   };
 
 public:
-  RegionEndEdges(AAResults *AA = nullptr) : AA(AA) {}
+  RegionEndEdges() {}
 };
 
 /// This Mutator is responsible for emitting "fixed" SUnits at the top or bottom
@@ -916,7 +914,7 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT, AAResults *AA) {
     // EmitFixedSUnits must run last. Both are applied via applyMutations()
     // inside AIEPostRASchedStrategy::buildGraph, which also suppresses the
     // redundant postProcessDAG() call from ScheduleDAGMI::schedule().
-    Mutations.emplace_back(createRegionEndEdgesMutation(AA));
+    Mutations.emplace_back(createRegionEndEdgesMutation());
     Mutations.emplace_back(std::make_unique<MemoryEdges>(true));
     Mutations.emplace_back(std::make_unique<MachineSchedWAWEdges>());
     Mutations.emplace_back(std::make_unique<BiasDepth>());
@@ -926,8 +924,8 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT, AAResults *AA) {
 }
 
 std::unique_ptr<ScheduleDAGMutation>
-AIEBaseSubtarget::createRegionEndEdgesMutation(AAResults *AA) {
-  return std::make_unique<RegionEndEdges>(AA);
+AIEBaseSubtarget::createRegionEndEdgesMutation() {
+  return std::make_unique<RegionEndEdges>();
 }
 
 std::unique_ptr<ScheduleDAGMutation>
diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h
index 9c0302b4b7d3..9e0146d50d92 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h
@@ -91,8 +91,7 @@ class AIEBaseSubtarget : public TargetSubtargetInfo {
   /// Create the RegionEndEdges mutation for use in buildGraph, where it is
   /// invoked directly after the other post-RA mutations and before
   /// createEmitFixedSUnitsMutation (ordering is significant).
-  static std::unique_ptr<ScheduleDAGMutation>
-  createRegionEndEdgesMutation(AAResults *AA);
+  static std::unique_ptr<ScheduleDAGMutation> createRegionEndEdgesMutation();
 
   /// Create the EmitFixedSUnits mutation for use in buildGraph, invoked after
   /// createRegionEndEdgesMutation to preserve the ExitSU-edge ordering
diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
index ea36c7ae9f9f..3d50710edad4 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
@@ -73,20 +73,31 @@ void DataDependenceHelper::dumpDot(raw_ostream &OS,
 }
 
 void InterBlockEdges::addNode(MachineInstr *MI) {
-  if (auto Index = DDG.initSUnit(*MI)) {
+  if (auto Index = initSUnit(*MI)) {
     IndexMap &TheMap = Boundary ? SuccMap : PredMap;
     TheMap.emplace(MI, *Index);
   }
 }
 
-void InterBlockEdges::markBoundary() { Boundary = DDG.SUnits.size(); }
+void InterBlockEdges::markBoundary() { Boundary = SUnits.size(); }
+
+bool InterBlockEdges::mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) {
+  if (SafeToIgnoreMemDeps && Boundary) {
+    // Suppress memory edges that cross the pre/post boundary.
+    const bool AIsPost = SUa->NodeNum >= *Boundary;
+    const bool BIsPost = SUb->NodeNum >= *Boundary;
+    if (AIsPost != BIsPost)
+      return false;
+  }
+  return DataDependenceHelper::mayAlias(SUa, SUb, TBAA);
+}
 
 const SUnit *InterBlockEdges::getPreBoundaryNode(MachineInstr *MI) const {
   const auto Found = PredMap.find(MI);
   if (Found == PredMap.end()) {
     return nullptr;
   }
-  return &DDG.SUnits.at(Found->second);
+  return &SUnits.at(Found->second);
 }
 
 bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
index 34b12e9c0a94..58e5d4404639 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
@@ -38,6 +38,8 @@ class DataDependenceHelper : public ScheduleDAGInstrs {
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
   const MachineSchedContext &Context;
   void schedule() override {};
+
+protected:
   bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override;
 
 public:
@@ -59,10 +61,14 @@ class DataDependenceHelper : public ScheduleDAGInstrs {
 /// This class generates all edges between nodes in two flow-adjacent regions.
 /// The nodes are added in forward flow order, marking the boundary at the
 /// appropriate point.
-class InterBlockEdges {
-  DataDependenceHelper DDG;
+///
+/// When SafeToIgnoreMemDeps is set, memory-alias edges that cross the
+/// pre/post boundary are suppressed via a mayAlias() override.
+class InterBlockEdges : public DataDependenceHelper {
   // The boundary between Pred and Succ nodes.
   std::optional<unsigned> Boundary;
+  // When true, memory edges crossing the boundary are suppressed.
+  bool SafeToIgnoreMemDeps = false;
 
   /// We can add the same instruction on both sides of the boundary.
   /// We maintain explicit maps to retrieve the corresponding SUnit.
@@ -70,9 +76,13 @@ class InterBlockEdges {
   IndexMap PredMap;
   IndexMap SuccMap;
 
+  bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override;
+
 public:
-  InterBlockEdges(const MachineSchedContext &Context)
-      : DDG(Context, true, true) {}
+  InterBlockEdges(const MachineSchedContext &Context,
+                  bool SafeToIgnoreMemDeps = false)
+      : DataDependenceHelper(Context, true, true),
+        SafeToIgnoreMemDeps(SafeToIgnoreMemDeps) {}
 
   /// Add a Node to the DAG.
   void addNode(MachineInstr *);
@@ -83,13 +93,9 @@ class InterBlockEdges {
   /// part of the successor.
   void markBoundary();
 
-  /// Create all the edges by interpreting read and write events of the nodes
-  /// in reverse order.
-  void buildEdges() { DDG.buildEdges(); }
-
   /// To iterate forward across the SUnits of the underlying DDG.
-  auto begin() const { return DDG.SUnits.begin(); }
-  auto end() const { return DDG.SUnits.end(); }
+  auto begin() const { return SUnits.begin(); }
+  auto end() const { return SUnits.end(); }
 
   /// The following two methods are used to find the cross-boundary edges,
   /// by starting from a pre-boundary node and selecting its successor edges
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
index 9677aa989514..1da87996b0e1 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -74,106 +74,6 @@ bool MaxLatencyFinder::isBottomRegion(MachineInstr *ExitMI) {
   return std::next(It) == CurBB->end();
 }
 
-/// Check whether SrcOp and DstOp might refer to the same value
-static bool overlap(const MachineOperand &SrcOp, const MachineOperand &DstOp,
-                    const TargetRegisterInfo *TRI) {
-  Register SrcReg = SrcOp.getReg();
-  Register DstReg = DstOp.getReg();
-
-  // Use TRI's regsOverlap which handles both physical and virtual registers,
-  // including subregisters and lane masks
-  return TRI->regsOverlap(SrcReg, DstReg);
-}
-
-/// Check whether Dst depends on Src
-static bool depends(const MachineInstr &Src, const MachineInstr &Dst,
-                    const TargetRegisterInfo *TRI, AAResults *AA,
-                    bool SafeToIgnoreMemDeps) {
-
-  const AIEBaseInstrInfo *const TII = static_cast<const AIEBaseInstrInfo *>(
-      Src.getMF()->getSubtarget().getInstrInfo());
-  // Detect dependency between lock and ld/st intructions.
-  if ((TII->isLock(Src.getOpcode()) && (Dst.mayLoadOrStore())) ||
-      (TII->isLock(Dst.getOpcode()) && (Src.mayLoadOrStore()))) {
-    return true;
-  }
-
-  // We detect any common register input/output between Dst and Src
-  for (auto &SrcOp : Src.operands()) {
-    if (!SrcOp.isReg()) {
-      continue;
-    }
-    for (auto &DstOp : Dst.operands()) {
-      if (!DstOp.isReg()) {
-        continue;
-      }
-      // Exclude the RAR case
-      if (SrcOp.isUse() && DstOp.isUse()) {
-        continue;
-      }
-      if (overlap(SrcOp, DstOp, TRI)) {
-        return true;
-      }
-    }
-  }
-
-  // Use alias analysis if available.
-  // The memory latency is accounted for by maxLatency() and any
-  // possible dependence will be corrected for by its scheduled cycle.
-  // (RAW || WAW) ||
-  // (WAR)
-  if ((Src.mayStore() && (Dst.mayLoad() || Dst.mayStore())) ||
-      (Src.mayLoad() && Dst.mayStore())) {
-
-    // For non-part-word memory instructions, use alias analysis (if available)
-    // to determine if Src and Dst may alias. Part-word instructions are always
-    // treated conservatively due to their read-modify-write behavior.
-    auto IsPartWordStore = [&TII](const MachineInstr &MaybePartStore) {
-      return MaybePartStore.mayStore() &&
-             TII->isPartWordMemoryInst(MaybePartStore);
-    };
-
-    if (!IsPartWordStore(Src)) {
-
-      // If it's safe to ignore memory dependencies, skip memory checks.
-      if (SafeToIgnoreMemDeps)
-        return false;
-
-      if (AA)
-        return Src.mayAlias(AA, Dst, true);
-    }
-
-    // Conservative: assume dependency for part-word instructions or when AA
-    // is unavailable
-    return true;
-  }
-
-  return false;
-}
-
-InstrAndCycle findEarliestRef(const MachineInstr &SrcMI,
-                              ArrayRef<MachineBundle> Bundles, int Prune,
-                              AAResults *AA, bool SafeToIgnoreMemDeps) {
-  const TargetRegisterInfo *TRI =
-      SrcMI.getMF()->getSubtarget().getRegisterInfo();
-  int Cycle = 0;
-  for (const auto &Bundle : Bundles) {
-    if (Cycle >= Prune) {
-      LLVM_DEBUG(dbgs() << " prune at " << Cycle << "\n");
-      return {/*MI=*/nullptr, Cycle};
-    }
-    for (MachineInstr *DstMI : Bundle.getInstrs()) {
-      LLVM_DEBUG(dbgs() << " " << *DstMI);
-      if (depends(SrcMI, *DstMI, TRI, AA, SafeToIgnoreMemDeps)) {
-        LLVM_DEBUG(dbgs() << "    depends in cycle=" << Cycle << "\n");
-        return {DstMI, Cycle};
-      }
-    }
-    Cycle++;
-  }
-  return {/*MI=*/nullptr, Cycle};
-}
-
 MaxLatencyFinder::MaxLatencyFinder(const MachineSchedContext &C,
                                    const AIEPostRASchedStrategy *Scheduler,
                                    MachineBasicBlock *CurBB)
@@ -181,14 +81,14 @@ MaxLatencyFinder::MaxLatencyFinder(const MachineSchedContext &C,
                                 C.MF->getSubtarget().getInstrInfo())),
       Itineraries(C.MF->getSubtarget().getInstrItineraryData()),
       TRI(C.MF->getSubtarget().getRegisterInfo()), CurBB(CurBB),
-      InterBlock(true), AA(C.AA), SafeToIgnoreMemDeps(false) {}
+      InterBlock(true) {}
 
 // This is called from different contexts, so we need some case analysis
 // If we have a basic block, we are in a regular MachineScheduler invocation,
 // and we will be able to retrieve its strategy,
 // Otherwise we are an abstract region; Scheduler will be nullptr, which
 // will not be dereferenced.
-MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA)
+MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG)
     : Scheduler(DAG->getBB()
                     ? static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl()
                     : nullptr),
@@ -197,20 +97,7 @@ MaxLatencyFinder::MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA)
       TRI(DAG->MF.getSubtarget().getRegisterInfo()), CurBB(DAG->getBB()),
       InterBlock(InterBlockLatency && CurBB &&
                  isBottomRegion(DAG->ExitSU.getInstr()) &&
-                 Scheduler->successorsAreScheduled(CurBB)),
-      AA(AA),
-      // This is a current assumption needed to achieve a proper compact
-      // schedule.
-      // A loop is considered a candidate for outer loop pipelining if there are
-      // no memory-carried dependencies. The outer loop pipeliner attaches
-      // related metadata to the loop/epilogue, which we capture here. This
-      // metadata indicates that epilogue stores will not alias with loads from
-      // the peeled iteration. We will further analyze why AA is too
-      // conservative in some cases and remove this assumption when possible.
-      SafeToIgnoreMemDeps(Scheduler && CurBB &&
-                          Scheduler->getInterBlock()
-                              .getBlockState(CurBB)
-                              .isSafeToIgnoreMemDeps()) {
+                 Scheduler->successorsAreScheduled(CurBB)) {
   if (CurBB && Scheduler) {
     const Region &CurRegion =
         Scheduler->getInterBlock().getBlockState(CurBB).getCurrentRegion();
@@ -224,8 +111,14 @@ void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) {
 
   HasUnknownSuccessors = CurBB->succ_empty();
 
+  // When the outer loop pipeliner has annotated the epilogue to indicate that
+  // epilogue stores will not alias with loads from the peeled iteration, we
+  // suppress cross-boundary memory edges in the inter-block DDG accordingly.
+  const bool SafeToIgnoreMemDeps =
+      IB.getBlockState(CurBB).isSafeToIgnoreMemDeps();
+
   for (MachineBasicBlock *SuccBB : CurBB->successors()) {
-    SuccessorEdges &SE = PerSuccEdges.emplace_back(C);
+    SuccessorEdges &SE = PerSuccEdges.emplace_back(C, SafeToIgnoreMemDeps);
 
     // Pre-boundary: free instructions of the current region.
     for (MachineInstr *MI : CurRegion.getFreeInstructions())
@@ -291,33 +184,8 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
     return Latency;
   }
 
-  LLVM_DEBUG(dbgs() << "Earliest for: " << MI);
-
-  // Original findEarliestRef-based effective latency (the actual return value).
-  // For each successor block, find the earliest cycle in its top region where
-  // MI creates a dependency; reduce the raw latency by that many cycles.
-  int Earliest = Latency;
-  for (MachineBasicBlock *SuccBB : CurBB->successors()) {
-    const auto &SBS = IB.getBlockState(SuccBB);
-    assert(SBS.isScheduled());
-    if (SBS.getRegions().empty()) {
-      // Empty block: no instructions to find a dependency in, conservative.
-      Earliest = 0;
-      continue;
-    }
-    const std::vector<AIE::MachineBundle> &TopBundles = SBS.getTop().Bundles;
-    Earliest =
-        findEarliestRef(MI, TopBundles, Earliest, AA, SafeToIgnoreMemDeps)
-            .Cycle;
-  }
-  LLVM_DEBUG(dbgs() << "   Earliest=" << Earliest << "\n");
-  const int OldEffectiveLatency = std::max(Latency - Earliest, 1);
-  LLVM_DEBUG(dbgs() << "   EffectiveLatency(old)=" << OldEffectiveLatency
-                    << "\n");
-
-  // New DDG-based computation — logged for comparison while investigating.
-  int NewEffectiveLatency = HasUnknownSuccessors ? Latency : 0;
-  LLVM_DEBUG(dbgs() << "   NewEffectiveLatency init=" << NewEffectiveLatency
+  int EffectiveLatency = HasUnknownSuccessors ? Latency : 0;
+  LLVM_DEBUG(dbgs() << "   EffectiveLatency=" << EffectiveLatency
                     << (HasUnknownSuccessors ? " (HasUnknownSuccessors)"
                                              : " (known successors)")
                     << "\n");
@@ -353,18 +221,16 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
                                             : std::to_string(Succ->NodeNum))
                  << ": latency=" << EdgeLat << ", depth=" << Depth
                  << ", remaining=" << Remaining
-                 << ", updating NewEffectiveLatency " << NewEffectiveLatency
-                 << " -> " << std::max(NewEffectiveLatency, Remaining) << "\n");
-      NewEffectiveLatency = std::max(NewEffectiveLatency, Remaining);
+                 << ", updating EffectiveLatency " << EffectiveLatency << " -> "
+                 << std::max(EffectiveLatency, Remaining) << "\n");
+      EffectiveLatency = std::max(EffectiveLatency, Remaining);
     }
   }
-  // Cap at the raw maxLatency of the source instruction, matching the old
-  // computation where the result is naturally bounded by Latency.
-  NewEffectiveLatency = std::min(NewEffectiveLatency, Latency);
-  LLVM_DEBUG(dbgs() << "   EffectiveLatency(new)=" << NewEffectiveLatency
-                    << "\n");
+  // Cap at the raw maxLatency of the source instruction.
+  EffectiveLatency = std::min(EffectiveLatency, Latency);
+  LLVM_DEBUG(dbgs() << "   EffectiveLatency=" << EffectiveLatency << "\n");
 
-  return static_cast<unsigned>(NewEffectiveLatency);
+  return static_cast<unsigned>(EffectiveLatency);
 }
 
 } // namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
index 22bcb5a4c441..058970563287 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
@@ -31,20 +31,6 @@ namespace llvm::AIE {
 int maxLatency(const MachineInstr *MI, const AIEBaseInstrInfo &InstrInfo,
                const InstrItineraryData &Itineraries, bool IncludeStages);
 
-struct InstrAndCycle {
-  MachineInstr *MI = nullptr;
-  int Cycle;
-};
-
-/// Find the first dependence on SrcMI in Bundles[0,Prune)
-/// \returns the Cycle in which the dependence happens or a conservative lower
-///          bound and the instruction responsible for the dependency if it is
-///          found.
-InstrAndCycle findEarliestRef(const MachineInstr &SrcMI,
-                              ArrayRef<MachineBundle> Bundles, int Prune,
-                              AAResults *AA = nullptr,
-                              bool SafeToIgnoreMemDeps = false);
-
 /// Interblock dependence information for one specific CFG successor.
 /// Contains a DDG built from the current block's bottom region (pre-boundary)
 /// and the successor's top region (post-boundary). When the successor has been
@@ -64,8 +50,9 @@ class SuccessorEdges {
   int SuccTopRegionLength = 0;
 
 public:
-  explicit SuccessorEdges(const MachineSchedContext &C)
-      : Edges(std::make_unique<InterBlockEdges>(C)) {}
+  explicit SuccessorEdges(const MachineSchedContext &C,
+                          bool SafeToIgnoreMemDeps = false)
+      : Edges(std::make_unique<InterBlockEdges>(C, SafeToIgnoreMemDeps)) {}
 
   // Building interface — called during buildInterBlockEdges().
   void addNode(MachineInstr *MI) { Edges->addNode(MI); }
@@ -100,8 +87,6 @@ class MaxLatencyFinder {
   const MCRegisterInfo *const TRI;
   MachineBasicBlock *const CurBB;
   const bool InterBlock;
-  AAResults *AA;
-  bool SafeToIgnoreMemDeps;
 
   /// One entry per CFG successor of CurBB. SuccessorEdges is moveable
   /// (InterBlockEdges is heap-allocated via unique_ptr inside it), so the
@@ -121,13 +106,13 @@ class MaxLatencyFinder {
 
 public:
   // Constructors
-  // Derive TII, TRI, Itineraries, and AA from the scheduling context, keeping
+  // Derive TII, TRI, and Itineraries from the scheduling context, keeping
   // only Scheduler and CurBB as explicit parameters.
   MaxLatencyFinder(const MachineSchedContext &C,
                    const AIEPostRASchedStrategy *Scheduler,
                    MachineBasicBlock *CurBB);
 
-  MaxLatencyFinder(ScheduleDAGInstrs *DAG, AAResults *AA = nullptr);
+  MaxLatencyFinder(ScheduleDAGInstrs *DAG);
 
   // Find the maximum latency of MI taking successors into account.
   unsigned operator()(MachineInstr &MI);

From fb1c127c42cf4b97ef396dbe6386558dc97e97e5 Mon Sep 17 00:00:00 2001
From: Martien de Jong <martien.de-jong@xilinx.com>
Date: Tue, 26 May 2026 17:40:06 +0200
Subject: [PATCH 9/9] [NFC][AIE] generalize InterBlockEdges with Depth and
 Height maps.

Eliminate SuccessorEdges in favour of InterBlockEdges
---
 .../Target/AIE/AIEDataDependenceHelper.cpp    | 31 +++++++++
 llvm/lib/Target/AIE/AIEDataDependenceHelper.h | 54 +++++++++++++++
 .../Target/AIE/AIEInterBlockScheduling.cpp    | 67 +++++++++++--------
 llvm/lib/Target/AIE/AIEInterBlockScheduling.h |  6 +-
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp   | 12 +---
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp   | 14 ++--
 llvm/lib/Target/AIE/AIEMaxLatencyFinder.h     | 60 ++---------------
 7 files changed, 143 insertions(+), 101 deletions(-)

diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
index 3d50710edad4..80b79cba36c7 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.cpp
@@ -104,4 +104,35 @@ bool InterBlockEdges::isPostBoundaryNode(SUnit *SU) const {
   return Boundary ? SU->NodeNum >= *Boundary : false;
 }
 
+void InterBlockEdges::recordPostDepth(MachineInstr *MI, int Depth) {
+  const auto Found = SuccMap.find(MI);
+  if (Found == SuccMap.end())
+    return;
+  PostDepths[Found->second] = Depth;
+}
+
+int InterBlockEdges::getPostDepthOr(const SUnit *SU, int Default) const {
+  const auto It = PostDepths.find(SU->NodeNum);
+  return It != PostDepths.end() ? It->second : Default;
+}
+
+void InterBlockEdges::recordPreHeightsFromSuccessors() {
+  for (const auto &[MI, NodeNum] : PredMap) {
+    const SUnit &SU = SUnits.at(NodeNum);
+    int MinHeight = std::numeric_limits<int>::max();
+    for (const SDep &Dep : SU.Succs) {
+      if (!isPostBoundaryNode(Dep.getSUnit()))
+        continue;
+      MinHeight = std::min(MinHeight, int(Dep.getSUnit()->getHeight()));
+    }
+    if (MinHeight != std::numeric_limits<int>::max())
+      PreHeights[NodeNum] = MinHeight;
+  }
+}
+
+int InterBlockEdges::getPreHeight(const SUnit *SU) const {
+  const auto It = PreHeights.find(SU->NodeNum);
+  return It != PreHeights.end() ? It->second : std::numeric_limits<int>::max();
+}
+
 } // end namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
index 58e5d4404639..dcace95370ad 100644
--- a/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
+++ b/llvm/lib/Target/AIE/AIEDataDependenceHelper.h
@@ -17,6 +17,7 @@
 
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include <limits>
 #include <map>
 #include <optional>
 
@@ -64,6 +65,23 @@ class DataDependenceHelper : public ScheduleDAGInstrs {
 ///
 /// When SafeToIgnoreMemDeps is set, memory-alias edges that cross the
 /// pre/post boundary are suppressed via a mayAlias() override.
+///
+/// The class also provides optional depth and height maps (both keyed by SUnit
+/// NodeNum, so they remain unambiguous when the same MachineInstr* appears on
+/// both sides of the boundary, e.g. in a single-block loop):
+///
+///   PostDepths — top-down cycle of each post-boundary node.  Populated by
+///   recordPostDepth(); queried by getPostDepthOr().
+///
+///   PreHeights — for each pre-boundary node, the minimum getHeight() of its
+///   post-boundary successors in the DDG.  Populated by
+///   recordPreHeightsFromSuccessors() after buildEdges(); queried by
+///   getPreHeight().
+///
+///   PreRegionLength — total number of bundles in the pre-boundary region.
+///
+///   PostRegionLength — total number of bundles in the post-boundary region,
+///   used to represent the depth of the artificial ExitSU node.
 class InterBlockEdges : public DataDependenceHelper {
   // The boundary between Pred and Succ nodes.
   std::optional<unsigned> Boundary;
@@ -76,6 +94,16 @@ class InterBlockEdges : public DataDependenceHelper {
   IndexMap PredMap;
   IndexMap SuccMap;
 
+  /// Depth (top-down cycle) of post-boundary SUnits, keyed by NodeNum.
+  std::map<unsigned, int> PostDepths;
+  /// For each pre-boundary SUnit, the minimum getHeight() of its
+  /// post-boundary successors (keyed by NodeNum).
+  std::map<unsigned, int> PreHeights;
+  /// Total number of bundles in the pre-boundary region.
+  int PreRegionLength = 0;
+  /// Total number of bundles in the post-boundary region.
+  int PostRegionLength = 0;
+
   bool mayAlias(SUnit *SUa, SUnit *SUb, bool TBAA) override;
 
 public:
@@ -107,6 +135,32 @@ class InterBlockEdges : public DataDependenceHelper {
 
   /// Check whether SU represents an instruction after the boundary.
   bool isPostBoundaryNode(SUnit *SU) const;
+
+  // Post-boundary depth interface.
+  /// Record the top-down cycle of a post-boundary instruction.
+  void recordPostDepth(MachineInstr *MI, int Depth);
+  /// Get the recorded top-down cycle of a post-boundary SUnit, or \p Default
+  /// if no depth has been recorded (e.g. the instruction is beyond the
+  /// conflict horizon).
+  int getPostDepthOr(const SUnit *SU, int Default) const;
+  /// Clear all recorded post-boundary depths.  Call before repopulating.
+  void clearPostDepths() { PostDepths.clear(); }
+
+  // Pre-boundary height interface.
+  /// Compute and store, for each pre-boundary SUnit, the minimum getHeight()
+  /// of its post-boundary successors.  Must be called after buildEdges().
+  void recordPreHeightsFromSuccessors();
+  /// Get the stored height of a pre-boundary SUnit.
+  /// Returns INT_MAX if not recorded (conservative: no loop-carried use).
+  int getPreHeight(const SUnit *SU) const;
+
+  // Pre-boundary region length.
+  void setPreRegionLength(int Length) { PreRegionLength = Length; }
+  int getPreRegionLength() const { return PreRegionLength; }
+
+  // Post-boundary region length (used as depth of the ExitSU node).
+  void setPostRegionLength(int Length) { PostRegionLength = Length; }
+  int getPostRegionLength() const { return PostRegionLength; }
 };
 
 } // namespace AIE
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index 14d7712b5587..814405a06e2d 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -536,7 +536,7 @@ InterBlockScheduling::resourcesConverged(BlockState &BS,
   return nullptr;
 }
 
-MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
+MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) {
   const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget();
   auto *TII = static_cast<const AIEBaseInstrInfo *>(SubTarget.getInstrInfo());
   auto *ItinData = SubTarget.getInstrItineraryData();
@@ -550,17 +550,18 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
   // If the successor is in Top, we lookup its depth in TopDepth
   const Region &Bottom = BS.getBottom();
   const Region &Top = BS.getTop();
-  const InterBlockEdges &BackEdges = BS.getBoundaryEdges();
+  InterBlockEdges &BackEdges = BS.getBoundaryEdges();
 
-  // Record the depth of all instructions in Top. Don't record the ones that
-  // can't cause problems
-  std::map<MachineInstr *, int> TopDepth;
-  int Depth = 0;
+  // Repopulate the post-boundary depths from the current scheduled bundles of
+  // the top region, capped at the conflict horizon.  Clear first so that stale
+  // values from a previous fixpoint iteration are not retained.
+  BackEdges.clearPostDepths();
+  int TopDepth = 0;
   for (auto &Bundle : Top.Bundles) {
     for (auto *MI : Bundle.getInstrs()) {
-      TopDepth[MI] = Depth;
+      BackEdges.recordPostDepth(MI, TopDepth);
     }
-    if (++Depth > HR->getConflictHorizon()) {
+    if (++TopDepth > HR->getConflictHorizon()) {
       break;
     }
   }
@@ -586,14 +587,14 @@ MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
           continue;
         }
         DEBUG_LOOPAWARE(dbgs() << "  Backedge to " << Succ->NodeNum << "\n");
-        auto DepthIt = TopDepth.find(Succ->getInstr());
-        if (DepthIt == TopDepth.end()) {
-          // Over the horizon
-          continue;
-        }
-        DEBUG_LOOPAWARE(dbgs() << "  Depth=" << DepthIt->second << "\n");
+        // Instructions beyond the conflict horizon default to ConflictHorizon,
+        // so that Distance = Height + ConflictHorizon >= 1 + ConflictHorizon,
+        // which is always >= Latency, naturally avoiding false positives.
+        const int SuccDepth =
+            BackEdges.getPostDepthOr(Succ, HR->getConflictHorizon());
+        DEBUG_LOOPAWARE(dbgs() << "  Depth=" << SuccDepth << "\n");
         int Latency = SDep.getSignedLatency();
-        int Distance = Height + DepthIt->second;
+        int Distance = Height + SuccDepth;
         if (Distance < Latency) {
           DEBUG_LOOPAWARE(dbgs() << "  Latency(" << Pred->NodeNum << "->"
                                  << Succ->NodeNum << ")=" << Latency
@@ -1069,26 +1070,34 @@ int InterBlockScheduling::getCyclesToRespectTiming(
   int DistFromLoopEntry = 0;
   int EntryNops = 0;
 
-  auto AddRegionToEdges = [&](const Region &R) {
-    for (auto &Bundle : R.Bundles) {
-      for (MachineInstr *MI : Bundle.getInstrs()) {
-        DistancesFromLoopEntry[MI] = DistFromLoopEntry;
-      }
-      ++DistFromLoopEntry;
-    }
+  auto AddRegionToEdges = [&](const Region &R, bool IsPostBoundary = false) {
+    // Add nodes first so that SuccMap/PredMap are populated before depth
+    // recording (recordPostDepth looks up SuccMap by MachineInstr*).
     // Here we need to iterate using semantic order.
     assert(R.top_fixed_instrs().empty() && "SWP epilogue already emitted?");
     for (MachineInstr *MI : R.getFreeInstructions()) {
       Edges.addNode(MI);
     }
+    for (auto &Bundle : R.Bundles) {
+      for (MachineInstr *MI : Bundle.getInstrs()) {
+        if (IsPostBoundary) {
+          Edges.recordPostDepth(MI, DistFromLoopEntry);
+        } else {
+          DistancesFromLoopEntry[MI] = DistFromLoopEntry;
+        }
+      }
+      ++DistFromLoopEntry;
+    }
   };
 
   // Construction of the superblock containing Loop+Epilogue
   // First part is the loop
   AddRegionToEdges(LoopBS.getBottom());
+  Edges.setPreRegionLength(DistFromLoopEntry);
   Edges.markBoundary();
   // Second part is the epilogue itself
-  AddRegionToEdges(EpilogueBS.getTop());
+  AddRegionToEdges(EpilogueBS.getTop(), /*IsPostBoundary=*/true);
+  Edges.setPostRegionLength(DistFromLoopEntry - Edges.getPreRegionLength());
   Edges.buildEdges();
 
   DEBUG_LOOPAWARE(dumpInterBlock(Edges));
@@ -1108,11 +1117,12 @@ int InterBlockScheduling::getCyclesToRespectTiming(
 
         const int PostBoundOrExitDist =
             (PostBoundaryMI != nullptr)
-                ? DistancesFromLoopEntry[PostBoundaryMI]
-                // When getInstr returns nullptr, we reached
-                // ExitSU. We can consider the DistFromLoopEntry as
-                // depth of the ExitSU.
-                : DistFromLoopEntry;
+                ? Edges.getPostDepthOr(Succ, 0)
+                // When getInstr returns nullptr, we reached ExitSU.
+                // The coordinate system counts from the start of the loop
+                // (same as DistancesFromLoopEntry), so ExitSU is at
+                // pre-region + post-region bundles.
+                : Edges.getPreRegionLength() + Edges.getPostRegionLength();
 
         const int Latency = SDep.getSignedLatency();
         const int Distance =
@@ -1335,6 +1345,7 @@ void BlockState::initInterBlock(const MachineSchedContext &Context,
     BoundaryEdges->addNode(MI);
   }
   BoundaryEdges->buildEdges();
+  BoundaryEdges->recordPreHeightsFromSuccessors();
   DEBUG_LOOPAWARE(dumpInterBlock(*BoundaryEdges));
 }
 
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index 737195227930..f8260989ed78 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -215,6 +215,10 @@ class BlockState {
   const Region &getTop() const { return Regions.back(); }
   Region &getTop() { return Regions.back(); }
   const Region &getBottom() const { return Regions.front(); }
+  InterBlockEdges &getBoundaryEdges() {
+    assert(Kind == BlockType::Loop && BoundaryEdges);
+    return *BoundaryEdges;
+  }
   const InterBlockEdges &getBoundaryEdges() const {
     assert(Kind == BlockType::Loop && BoundaryEdges);
     return *BoundaryEdges;
@@ -303,7 +307,7 @@ class InterBlockScheduling {
 
   /// Return one instruction that needs a higher latency cap, or nullptr if all
   /// latencies converged.
-  MachineInstr *latencyConverged(BlockState &BS) const;
+  MachineInstr *latencyConverged(BlockState &BS);
 
   /// After finding the loops, determine the epilogue blocks.
   void markEpilogueBlocks();
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index fd045039ae14..e144eecb29c6 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -1160,17 +1160,7 @@ int getEarliestLoopCarriedUse(const SUnit &SU,
   assert(SUInCurrentIteration);
   assert(SUInCurrentIteration->getHeight() >= SU.getHeight());
 
-  // Look at loop-carried dependencies to see how early the instruction will be
-  // needed in the next iteration.
-  int EarliestCycle = std::numeric_limits<int>::max();
-  for (const SDep &Succ : SUInCurrentIteration->Succs) {
-    if (!LoopEdges.isPostBoundaryNode(Succ.getSUnit()))
-      continue;
-
-    EarliestCycle = std::min(EarliestCycle, int(Succ.getSUnit()->getHeight()));
-  }
-
-  return EarliestCycle;
+  return LoopEdges.getPreHeight(SUInCurrentIteration);
 }
 
 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
index 1da87996b0e1..1ca79c94e817 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -118,7 +118,8 @@ void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) {
       IB.getBlockState(CurBB).isSafeToIgnoreMemDeps();
 
   for (MachineBasicBlock *SuccBB : CurBB->successors()) {
-    SuccessorEdges &SE = PerSuccEdges.emplace_back(C, SafeToIgnoreMemDeps);
+    InterBlockEdges &SE = *PerSuccEdges.emplace_back(
+        std::make_unique<InterBlockEdges>(C, SafeToIgnoreMemDeps));
 
     // Pre-boundary: free instructions of the current region.
     for (MachineInstr *MI : CurRegion.getFreeInstructions())
@@ -148,11 +149,11 @@ void MaxLatencyFinder::buildInterBlockEdges(const Region &CurRegion) {
     int Cycle = 0;
     for (const MachineBundle &Bundle : SBS.getTop().Bundles) {
       for (MachineInstr *MI : Bundle.getInstrs()) {
-        SE.recordDepth(MI, Cycle);
+        SE.recordPostDepth(MI, Cycle);
       }
       ++Cycle;
     }
-    SE.setTopRegionLength(Cycle);
+    SE.setPostRegionLength(Cycle);
   }
 }
 
@@ -189,7 +190,8 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
                     << (HasUnknownSuccessors ? " (HasUnknownSuccessors)"
                                              : " (known successors)")
                     << "\n");
-  for (SuccessorEdges &SE : PerSuccEdges) {
+  for (auto &SEPtr : PerSuccEdges) {
+    InterBlockEdges &SE = *SEPtr;
     const SUnit *Pred = SE.getPreBoundaryNode(&MI);
     if (!Pred) {
       LLVM_DEBUG(
@@ -211,8 +213,8 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
       // top region (all its cycles have elapsed before reaching ExitSU).
       // For a regular instruction node the depth is its scheduled cycle
       // within the block.
-      const int Depth = Succ->isBoundaryNode() ? SE.getTopRegionLength()
-                                               : SE.getDepth(Succ->getInstr());
+      const int Depth = Succ->isBoundaryNode() ? SE.getPostRegionLength()
+                                               : SE.getPostDepthOr(Succ, 0);
       const int EdgeLat = Dep.getSignedLatency();
       const int Remaining = EdgeLat - Depth;
       LLVM_DEBUG(
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
index 058970563287..7705cf2ac4c5 100644
--- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
+++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h
@@ -19,7 +19,6 @@
 #include "AIEDataDependenceHelper.h"
 #include "AIEMachineScheduler.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include <map>
 #include <memory>
 #include <vector>
 
@@ -31,55 +30,6 @@ namespace llvm::AIE {
 int maxLatency(const MachineInstr *MI, const AIEBaseInstrInfo &InstrInfo,
                const InstrItineraryData &Itineraries, bool IncludeStages);
 
-/// Interblock dependence information for one specific CFG successor.
-/// Contains a DDG built from the current block's bottom region (pre-boundary)
-/// and the successor's top region (post-boundary). When the successor has been
-/// scheduled, Depths records the cycle of each post-boundary instruction;
-/// unscheduled instructions are absent from Depths and treated as depth 0
-/// (conservative — no latency reduction).
-class SuccessorEdges {
-  /// Heap-allocated so that SuccessorEdges is moveable; InterBlockEdges
-  /// cannot be safely moved due to internal SUnit pointers.
-  std::unique_ptr<InterBlockEdges> Edges;
-  /// Maps each scheduled post-boundary MachineInstr* to its cycle
-  /// (0-indexed from the top of the successor block).
-  std::map<MachineInstr *, int> Depths;
-  /// Number of bundles in the successor block's top region. Used to reduce
-  /// the ExitSU edge latency by the number of cycles already elapsed in the
-  /// successor before it exits.
-  int SuccTopRegionLength = 0;
-
-public:
-  explicit SuccessorEdges(const MachineSchedContext &C,
-                          bool SafeToIgnoreMemDeps = false)
-      : Edges(std::make_unique<InterBlockEdges>(C, SafeToIgnoreMemDeps)) {}
-
-  // Building interface — called during buildInterBlockEdges().
-  void addNode(MachineInstr *MI) { Edges->addNode(MI); }
-  void markBoundary() { Edges->markBoundary(); }
-  void recordDepth(MachineInstr *MI, int Cycle) { Depths[MI] = Cycle; }
-  void buildEdges() { Edges->buildEdges(); }
-  void setTopRegionLength(int Length) { SuccTopRegionLength = Length; }
-
-  // Query interface — called from MaxLatencyFinder::operator().
-  const SUnit *getPreBoundaryNode(MachineInstr *MI) const {
-    return Edges->getPreBoundaryNode(MI);
-  }
-  bool isPostBoundaryNode(SUnit *SU) const {
-    return Edges->isPostBoundaryNode(SU);
-  }
-  /// Returns the scheduled cycle depth of MI. Returns 0 if MI is not found,
-  /// which is the conservative value for unscheduled instructions (no
-  /// reduction of the edge latency).
-  int getDepth(MachineInstr *MI) const {
-    const auto It = Depths.find(MI);
-    return It != Depths.end() ? It->second : 0;
-  }
-  /// Returns the number of bundles in the scheduled successor block's top
-  /// region. Returns 0 for unscheduled successors (conservative).
-  int getTopRegionLength() const { return SuccTopRegionLength; }
-};
-
 class MaxLatencyFinder {
   const AIEPostRASchedStrategy *const Scheduler;
   const AIEBaseInstrInfo *const TII;
@@ -88,10 +38,10 @@ class MaxLatencyFinder {
   MachineBasicBlock *const CurBB;
   const bool InterBlock;
 
-  /// One entry per CFG successor of CurBB. SuccessorEdges is moveable
-  /// (InterBlockEdges is heap-allocated via unique_ptr inside it), so the
-  /// vector may reallocate freely.
-  std::vector<SuccessorEdges> PerSuccEdges;
+  /// One entry per CFG successor of CurBB.  InterBlockEdges is heap-allocated
+  /// via unique_ptr because it inherits from ScheduleDAGInstrs, which is not
+  /// safely moveable.
+  std::vector<std::unique_ptr<InterBlockEdges>> PerSuccEdges;
 
   /// True when CurBB has no CFG successors (e.g. a return block), requiring
   /// the conservative raw latency as a floor.
@@ -100,7 +50,7 @@ class MaxLatencyFinder {
   // Check whether this region connects to the successor blocks.
   bool isBottomRegion(MachineInstr *ExitMI);
 
-  // Build one SuccessorEdges per CFG successor of CurBB and populate
+  // Build one InterBlockEdges per CFG successor of CurBB and populate
   // PerSuccEdges.
   void buildInterBlockEdges(const Region &CurRegion);