Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
// (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates
//
//===----------------------------------------------------------------------===//
//
Expand Down Expand Up @@ -559,6 +559,11 @@ bool AIEHazardRecognizer::checkConflict(
std::nullopt);
}

/// Convenience overload: check \p MI, offset by \p DeltaCycles, against this
/// recognizer's own Scoreboard by forwarding to the overload that takes an
/// explicit scoreboard.
bool AIEHazardRecognizer::checkConflict(MachineInstr &MI,
                                        int DeltaCycles) const {
  const bool HasConflict = checkConflict(Scoreboard, MI, DeltaCycles);
  return HasConflict;
}

bool AIEHazardRecognizer::checkConflict(
const ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AIE/AIEHazardRecognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,8 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer {
bool checkConflict(const ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
MachineInstr &MI, int DeltaCycles) const;

bool checkConflict(MachineInstr &MI, int DeltaCycles) const;

protected:
ScheduleHazardRecognizer::HazardType getHazardType(const MCInstrDesc &Desc,
int DeltaCycles) const;
Expand Down
214 changes: 204 additions & 10 deletions llvm/lib/Target/AIE/AIEMachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ static cl::opt<unsigned> ReservedDelaySlots(
"aie-reserved-delay-slots", cl::init(0),
cl::desc("[AIE] Number of delay slots to be left empty"));

/// Command-line switch "-aie-delay-slot-topdown", enabled by default.
/// AIEPostRASchedStrategy::initialize() consults it, together with the
/// presence of top-fixed bundles and the absence of bot-fixed bundles, to
/// decide whether a region containing a delay slot instruction is scheduled
/// entirely top-down (PersistentTopDown) instead of forcing bottom-up cycles.
static cl::opt<bool> EnableDelaySlotTopDown(
"aie-delay-slot-topdown", cl::init(true),
cl::desc("[AIE] When top-fixed bundles and a delay slot instruction "
"coexist, schedule the whole region top-down and fix up the "
"branch position in leaveRegion() instead of forcing bottom-up "
"cycles"));

/// This is a testing option. Resetting it prevents inter-block conflicts from
/// the scoreboard, so that all interblock scheduling effects can be blamed on
/// the latencies.
Expand Down Expand Up @@ -420,24 +427,41 @@ void AIEPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) {
: NonConservative);
initializeTopScoreBoard();

// Delay slots are scheduled bottom up to be sure the control-flow instruction
// is issued exactly TII->getNumDelaySlots() before the end of the region.
// Compute RegionTopDownCycles first so the delay slot logic can inspect it.
const Region &Reg = InterBlock.getBlockState(CurMBB).getCurrentRegion();
RegionTopDownCycles = Reg.getTopFixedBundles().size();

// Delay slots are normally scheduled bottom-up so the control-flow
// instruction is issued exactly TII->getNumDelaySlots() before the end of
// the region. However, when top-fixed bundles are present and
// -aie-delay-slot-topdown is enabled, we schedule the whole region top-down
// and rely on fixupDelaySlotPosition() (called from leaveRegion) to move
// the branch to the correct position afterwards.
unsigned DelaySlotCycles = 0;
PersistentTopDown = false;
if (MachineInstr *MI = getDelaySlotInstr(RegionBegin, RegionEnd)) {
auto *TII = getTII(CurMBB);
assert(RegionEnd != MI->getParent()->instr_end() &&
TII->isDelayedSchedBarrier(*RegionEnd));
// Schedule bottom-up for at least getNumDelaySlots() cycles, and an extra
// one for the delay slot instruction itself.
DelaySlotCycles = TII->getNumDelaySlots(*MI) + 1;
unsigned Reserved = std::max(ReservedDelaySlots.getValue(),
TII->getNumReservedDelaySlots(*MI));
getAIEHazardRecognizer(Bot)->setReservedCycles(Reserved);

if (RegionTopDownCycles > 0 && EnableDelaySlotTopDown &&
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we could assign this condition to DelaySlotFixupNeeded directly, then set DelaySlotCycles on it being false.

Reg.getBotFixedBundles().empty()) {
// Top-fixed bundles present, no bot-fixed bundles, and the option is
// enabled: schedule fully top-down and fix up the branch position in
// leaveRegion(). Bot-fixed bundles are incompatible with this path
// because fixupDelaySlotPosition requires BotBundles to be empty.
PersistentTopDown = true;
} else {
// Normal case: force enough bottom-up cycles to place the branch at the
// correct distance from the end of the region.
DelaySlotCycles = TII->getNumDelaySlots(*MI) + 1;
}
}

RegionBottomUpCycles = std::max(BottomUpCycles.getValue(), DelaySlotCycles);
const Region &Reg = InterBlock.getBlockState(CurMBB).getCurrentRegion();
RegionTopDownCycles = Reg.getTopFixedBundles().size();
// Start with top-down when we have TopInsert bundles.
IsTopDown = (RegionBottomUpCycles == 0) || (RegionTopDownCycles > 0);
if (!IsTopDown) {
Expand Down Expand Up @@ -478,8 +502,11 @@ bool AIEPostRASchedStrategy::doesNotProgressInZone(const SchedBoundary &Zone,
if (isFixedSU(SU, !Zone.isTop()))
return true;

// We cannot proceed with delay slot instructions in the top zone.
return Zone.isTop() && SU.getInstr()->hasDelaySlot();
// We cannot proceed with delay slot instructions in the top zone, unless we
// are using the post-scheduling fixup path (PersistentTopDown). In that
// case the branch is allowed to be scheduled anywhere and its position will
// be corrected in leaveRegion().
return Zone.isTop() && SU.getInstr()->hasDelaySlot() && !PersistentTopDown;
}

// This function returns true when it is impossible to continue with top-down
Expand Down Expand Up @@ -528,11 +555,14 @@ SUnit *AIEPostRASchedStrategy::pickNodeAndCycle(
// RegionBottomUpCycles.
LLVM_DEBUG(dbgs() << "*** Switching to top-down ***\n");
IsTopDown = true;
} else if (IsTopDown && RegionTopDownCycles &&
} else if (IsTopDown && RegionTopDownCycles && !PersistentTopDown &&
(Top.getCurrCycle() >= RegionTopDownCycles ||
mustSwitchToBottomUp())) {
// We have scheduled all top-fixed instructions, filling as many slots as
// possible. Now it is time to proceed with the bottom-up approach.
// Note: when PersistentTopDown is true (top-fixed bundles + delay slot),
// we stay top-down for the entire region and fix up the branch position
// in leaveRegion().
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need a lot of explaining comment. I think that's caused by using a necessary repair (DelaySlotFixupNeeded) as an implicit scheduling mode. Perhaps it would be clearer to rename to 'PersistentTopDown' and derive the necessary repair from it.

LLVM_DEBUG(dbgs() << "*** Switching to bottom-up ***\n");
IsTopDown = false;
}
Expand Down Expand Up @@ -842,6 +872,157 @@ void AIEPostRASchedStrategy::enterRegion(MachineBasicBlock *BB,
RegionEnd = End;
}

/// Return the index of the first bundle in \p Bundles that contains \p MI,
/// or -1 if not found.
///
/// Callers are expected to pass an instruction that is present in one of the
/// bundles, so a miss trips the assertion in asserts-enabled builds. When
/// assertions are compiled out, a miss yields the documented -1 sentinel
/// rather than Bundles.size(), which would look like a valid index one past
/// the end.
static int findInBundles(ArrayRef<AIE::MachineBundle> Bundles,
                         const MachineInstr *MI) {
  auto It = llvm::find_if(Bundles, [MI](const AIE::MachineBundle &B) {
    return llvm::is_contained(B.getInstrs(), MI);
  });
  assert(It != Bundles.end() && "MI not found in any bundle");
  if (It == Bundles.end())
    return -1;
  // Cast to the declared (signed) return type; the previous cast went through
  // 'unsigned' and relied on an implicit conversion back to int.
  return static_cast<int>(std::distance(Bundles.begin(), It));
}

/// Compute where \p BranchMI has to be spliced into \p MBB.
///
/// Bundles are visited from index \p PlacedIdx down to index 0. In each one,
/// the instruction list is searched back to front for an instruction other
/// than \p BranchMI; the first hit determines the result, which is
/// getBundleEnd() of that instruction. If no visited bundle holds such an
/// instruction, \p MBB->end() is returned.
static MachineBasicBlock::iterator
computeSplicePoint(ArrayRef<AIE::MachineBundle> Bundles, unsigned PlacedIdx,
                   MachineInstr *BranchMI, MachineBasicBlock *MBB) {
  const auto IsOtherInstr = [BranchMI](MachineInstr *Candidate) {
    return Candidate != BranchMI;
  };

  int BundleIdx = static_cast<int>(PlacedIdx);
  while (BundleIdx >= 0) {
    const auto &BundleInstrs = Bundles[BundleIdx].getInstrs();
    auto LastOther = llvm::find_if(llvm::reverse(BundleInstrs), IsOtherInstr);
    if (LastOther != BundleInstrs.rend())
      return getBundleEnd((*LastOther)->getIterator());
    --BundleIdx;
  }

  // Nothing but the branch itself in the visited bundles.
  return MBB->end();
}

/// Detach \p MI from \p Bundle.
///
/// Three pieces of bundle state are updated together: the instruction list
/// (Instrs), the slot-to-instruction map (SlotMap) and the occupied-slot mask
/// (OccupiedSlots). \p MI must have an entry in SlotMap, and the format
/// interface must know the slot it occupies; both are asserted.
static void removeFromBundle(AIE::MachineBundle &Bundle, MachineInstr *MI) {
  // Drop every occurrence of MI from the instruction list.
  auto &BundleInstrs = Bundle.Instrs;
  auto NewEnd = std::remove(BundleInstrs.begin(), BundleInstrs.end(), MI);
  BundleInstrs.erase(NewEnd, BundleInstrs.end());

  // Locate the slot that currently maps to MI.
  auto SlotEntry = llvm::find_if(
      Bundle.SlotMap, [MI](const auto &Entry) { return Entry.second == MI; });
  assert(SlotEntry != Bundle.SlotMap.end() && "MI not found in bundle SlotMap");

  // Clear that slot's bits from the occupancy mask, then forget the mapping.
  const auto *SlotInfo = Bundle.FormatInterface->getSlotInfo(SlotEntry->first);
  assert(SlotInfo && "No SlotInfo for slot containing MI");
  Bundle.OccupiedSlots &= ~SlotInfo->getSlotSet();
  Bundle.SlotMap.erase(SlotEntry);
}

/// Reposition the delay slot instruction \p BranchMI within \p TopBundles so
/// that exactly \p NumDelaySlots bundles follow it.
///
/// The steps, in order:
///  1. While fewer than NumDelaySlots bundles follow the branch, append empty
///     bundles. If the count then matches and checkInterZoneConflicts()
///     reports no conflict, return.
///  2. Otherwise remove the branch from its bundle and probe for a
///     conflict-free placement starting at TargetIdx, appending one empty
///     bundle per failed probe.
///  3. Add the branch to the chosen bundle, book it in the Top scoreboard and
///     splice it to the matching position in CurMBB. If the inter-zone check
///     still reports a conflict, append one more empty bundle and recurse.
///
/// \param TopBundles    Bundles of the top zone; may grow and is edited in
///                      place.
/// \param BotBundles    Bundles of the bottom zone; must be empty on entry
///                      (asserted) and is only passed to
///                      checkInterZoneConflicts().
/// \param BranchMI      The delay slot instruction; must be contained in one
///                      of the TopBundles.
/// \param NumDelaySlots Required number of bundles after the branch.
void AIEPostRASchedStrategy::fixupDelaySlotPosition(
std::vector<AIE::MachineBundle> &TopBundles,
std::vector<AIE::MachineBundle> &BotBundles, MachineInstr *BranchMI,
unsigned NumDelaySlots) {

// Only reached when PersistentTopDown is true: fully top-down region,
// no bot-fixed bundles. All NOPs go into TopBundles, keeping
// Top.getCurrCycle() in sync.
assert(BotBundles.empty() &&
"fixupDelaySlotPosition: BotBundles must be empty on entry");

const AIEBaseMCFormats *FmtIface = getTII(CurMBB)->getFormatInterface();
AIEHazardRecognizer *TopHR = getAIEHazardRecognizer(Top);
// Appends one empty (NOP) bundle to TopBundles and bumps Top by one cycle so
// that the zone's current cycle keeps matching the number of bundles.
auto AppendNop = [&]() {
Top.bumpCycle(Top.getCurrCycle() + 1);
TopBundles.emplace_back(FmtIface);
};

const int BranchIdx = findInBundles(TopBundles, BranchMI);

// Number of bundles currently following the branch. It is incremented by the
// NOP padding loop below.
unsigned BundlesAfterBranch =
TopBundles.size() - static_cast<unsigned>(BranchIdx) - 1;

LLVM_DEBUG({
dbgs() << "fixupDelaySlotPosition: BranchIdx=" << BranchIdx
<< " BundlesAfterBranch=" << BundlesAfterBranch
<< " NumDelaySlots=" << NumDelaySlots << "\n";
for (unsigned I = 0; I < TopBundles.size(); ++I) {
dbgs() << " Bundle[" << I << "]:";
for (MachineInstr *MI : TopBundles[I].getInstrs())
dbgs() << " "
<< MI->getMF()->getSubtarget().getInstrInfo()->getName(
MI->getOpcode());
dbgs() << "\n";
}
});

// Append NOPs until at least NumDelaySlots bundles follow the branch.
// If the count is then exact and the inter-zone scoreboard is also clean,
// we are done. Otherwise fall through to resolve the situation by moving the
// branch forward.
while (BundlesAfterBranch < NumDelaySlots) {
LLVM_DEBUG(dbgs() << "fixupDelaySlotPosition: appending empty bundle\n");
AppendNop();
BundlesAfterBranch++;
}

if (BundlesAfterBranch == NumDelaySlots &&
!checkInterZoneConflicts(BotBundles)) {
LLVM_DEBUG(dbgs() << "fixupDelaySlotPosition: done, position is correct "
"and scoreboard are aligned.\n");
return;
}

// Branch too early (or inter-zone conflict after NOP padding) – extract it
// and re-place at a conflict-free slot. Each conflict appends a NOP
// (advancing Top's scoreboard), maintaining exactly NumDelaySlots bundles
// after the final placement.
// MoveDown is the number of bundles the branch has to move towards the end;
// it is 0 when only the inter-zone check failed.
const unsigned MoveDown = BundlesAfterBranch - NumDelaySlots;
const unsigned TargetIdx = static_cast<unsigned>(BranchIdx) + MoveDown;

LLVM_DEBUG(dbgs() << "fixupDelaySlotPosition: extracting branch from bundle "
<< BranchIdx << " and placing at/after bundle " << TargetIdx
<< "\n");

// Remove BranchMI from its current bundle.
// NOTE(review): removeFromBundle() only edits the bundle, not the scoreboard.
// With MoveDown == 0 the first probe below targets the branch's original
// bundle, so it may see the branch's own earlier booking - confirm this is
// intended.
removeFromBundle(TopBundles[BranchIdx], BranchMI);

// Scan forward from TargetIdx. Delta = -(NumDelaySlots + 1) is constant:
// each AppendNop advances the Top scoreboard ring by one slot, so successive
// checks probe cycles TargetIdx, TargetIdx+1, ... The branch never conflicts
// with its own old booking at a later cycle.
const int Delta = -(static_cast<int>(NumDelaySlots) + 1);
unsigned PlacedIdx = TargetIdx;
while (TopHR->checkConflict(*BranchMI, Delta) ||
checkInterZoneConflicts(BotBundles)) {
LLVM_DEBUG(dbgs() << "fixupDelaySlotPosition: conflict at bundle "
<< PlacedIdx << ", appending empty bundle\n");
AppendNop();
++PlacedIdx;
}

LLVM_DEBUG(dbgs() << "fixupDelaySlotPosition: placing branch at bundle "
<< PlacedIdx << "\n");

// Add BranchMI to the chosen bundle and record its resource bookings in the
// Top scoreboard so subsequent inter-zone checks are accurate.
TopBundles[PlacedIdx].add(BranchMI);
Copy link
Copy Markdown
Collaborator Author

@andcarminati andcarminati May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should emit the instruction in the scoreboard here for the last check safety.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done!

TopHR->emitInScoreboard(*BranchMI, BranchMI->getDesc(), Delta);

// Physically move BranchMI to its correct position in the MBB, i.e. to the
// iterator computed by computeSplicePoint() for the chosen bundle.
CurMBB->splice(computeSplicePoint(TopBundles, PlacedIdx, BranchMI, CurMBB),
CurMBB, BranchMI->getIterator());

// Now that the branch's resource bookings are in the Top scoreboard,
// re-check for inter-zone conflicts caused by the branch itself. If one is
// found, append a NOP and recurse (MoveDown = 1). Terminates because the
// scoreboard has finite depth.
if (checkInterZoneConflicts(BotBundles)) {
LLVM_DEBUG(dbgs() << "fixupDelaySlotPosition: post-placement inter-zone "
"conflict, appending NOP and retrying\n");
AppendNop();
fixupDelaySlotPosition(TopBundles, BotBundles, BranchMI, NumDelaySlots);
}
}

void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
LLVM_DEBUG(dbgs() << " << leaveRegion\n");

Expand All @@ -863,6 +1044,19 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
std::vector<AIE::MachineBundle> TopBundles = computeAndFinalizeBundles(Top);
std::vector<AIE::MachineBundle> BotBundles = computeAndFinalizeBundles(Bot);
handleRegionConflicts(ExitSU, TopBundles, BotBundles);

// When the delay slot instruction was scheduled top-down (because top-fixed
// bundles were present), fix up its position in the bundle sequence so that
// exactly NumDelaySlots bundles follow it.
if (PersistentTopDown) {
if (MachineInstr *BranchMI = getDelaySlotInstr(RegionBegin, RegionEnd)) {
const auto *TII = getTII(CurMBB);
const unsigned NumDelaySlots = TII->getNumDelaySlots(*BranchMI);
fixupDelaySlotPosition(TopBundles, BotBundles, BranchMI, NumDelaySlots);
}
PersistentTopDown = false;
}

assert(BS.getCurrentRegion().Bundles.empty());
BS.addBundles(TopBundles);
BS.addBundles(BotBundles);
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AIE/AIEMachineScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ class AIEPostRASchedStrategy : public PostGenericScheduler {
/// Keeps track of the current zone used for scheduling. See getSchedZone().
bool IsTopDown = true;

/// When true, the delay slot instruction was allowed to be scheduled in the
/// top-down phase (because top-fixed bundles are present). After scheduling,
/// fixupDelaySlotPosition() will reposition it correctly.
bool PersistentTopDown = false;

MachineBasicBlock *CurMBB = nullptr;
MachineBasicBlock::iterator RegionBegin = nullptr;
MachineBasicBlock::iterator RegionEnd = nullptr;
Expand Down Expand Up @@ -176,6 +181,19 @@ class AIEPostRASchedStrategy : public PostGenericScheduler {
// After scheduling a block, fill in nops, apply bundling, etc.
void commitBlockSchedule(MachineBasicBlock *BB);

/// After top-down scheduling of a region that contains a delay slot
/// instruction, fix up its position in the bundle sequence so that exactly
/// NumDelaySlots bundles follow it.
///
/// If the branch ended up too early (too many bundles after it), we try to
/// move it down toward the end, checking for resource hazards at each step.
/// If moving is not possible, we fall back to appending empty (NOP) bundles
/// at the end. If the branch ended up too late (too few bundles after it),
/// we simply append the missing empty bundles.
void fixupDelaySlotPosition(std::vector<AIE::MachineBundle> &TopBundles,
std::vector<AIE::MachineBundle> &BotBundles,
MachineInstr *BranchMI, unsigned NumDelaySlots);

// This function returns true when it is impossible to continue with top-down
// without entering an infinite loop because the only remaining instructions
// cannot be scheduled in the top zone.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ body: |
; CHECK-NEXT: - Prologue: bb.0
; CHECK-NEXT: - PrologueBundles: '10'
; CHECK-NEXT: - Epilogue: bb.2.for.cond.cleanup
; CHECK-NEXT: - EpilogueBundles: '16'
; CHECK-NEXT: - EpilogueBundles: '11'
; CHECK-NEXT: ...
bb.0:
successors: %bb.1(0x80000000)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ body: |
; CHECK-NEXT: - Prologue: bb.1.outer.loop.header
; CHECK-NEXT: - PrologueBundles: '28'
; CHECK-NEXT: - Epilogue: bb.3.outer.loop.latch
; CHECK-NEXT: - EpilogueBundles: '33'
; CHECK-NEXT: - EpilogueBundles: '37'
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JNZ is conflicting with the move slot here.

; CHECK-NEXT: ...
bb.0.newFuncRoot (align 16):
successors: %bb.1(0x80000000)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
; CHECK-NEXT: .L_LEnd1:
; CHECK-NEXT: nopb ; nopa ; vst.srs.s8.s32 cm1, s0, [p1], #32; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv
; CHECK-NEXT: // %bb.4: // %for.cond.cleanup
; CHECK-NEXT: nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32; vmin_ge.s16 x8, r16, x6, x0
; CHECK-NEXT: vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32; vmin_ge.s16 x8, r16, x6, x0
Expand All @@ -81,12 +81,9 @@
; CHECK-NEXT: vmac cm1, cm0, x10, x4, r0
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32
; CHECK-NEXT: nop
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32
; CHECK-NEXT: nop
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32
; CHECK-NEXT: ret lr
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32; ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: vst.srs.s8.s32 cm1, s0, [p1], #32 // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ body: |
; CHECK-NEXT: - Prologue: bb.0
; CHECK-NEXT: - PrologueBundles: '14'
; CHECK-NEXT: - Epilogue: bb.2.for.cond.cleanup
; CHECK-NEXT: - EpilogueBundles: '20'
; CHECK-NEXT: - EpilogueBundles: '17'
; CHECK-NEXT: ...
bb.0:
successors: %bb.1(0x80000000)
Expand Down
Loading
Loading