Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 33 additions & 12 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ static cl::opt<bool> LoopEpilogueAnalysis(
"aie-loop-epilogue-analysis", cl::init(true),
cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling"));

static cl::opt<int> MaxExpensiveIterations(
"aie-loop-aware-expensive-iterations", cl::init(25),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a rationale behind this number?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and no. I feel anything over 50 is too much, and anything below 10 is not enough if we need to move a couple of instructions up by 2-3 cycles. So 25 felt like a good compromise. And this works well for loops with an II between 5 and 10 cycles, which is the territory of the PreRA pipeliner for us.

cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling"));

namespace llvm::AIE {

void dumpInterBlock(const InterBlockEdges &Edges) {
Expand Down Expand Up @@ -166,10 +170,13 @@ bool InterBlockScheduling::leaveBlock() {
if (BS.Kind == BlockType::Loop && !updateFixPoint(BS)) {
BS.FixPoint.NumIters++;
// Iterate on CurrentBlock
// We will first try to increase the latency margin for one instruction at
// a time, before increasing that margin for all instructions at once.
// If we are very unlucky, we may step both the latency margin and
// the resource margin to the max. Any more indicates failure to converge,
// and we abort to prevent an infinite loop.
if (BS.FixPoint.NumIters > 2 * HR->getConflictHorizon()) {
if (BS.FixPoint.NumIters >
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering your change, does this error become more common without this increase?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, it never triggered. I just changed the condition to account for the extra iterations; otherwise we would fail, thinking we are in an infinite loop.

MaxExpensiveIterations + 2 * HR->getConflictHorizon()) {
report_fatal_error("Inter-block scheduling did not converge.");
}
return false;
Expand Down Expand Up @@ -219,7 +226,7 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const {
return true;
}

bool InterBlockScheduling::latencyConverged(BlockState &BS) const {
MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const {
const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget();
auto *TII = static_cast<const AIEBaseInstrInfo *>(SubTarget.getInstrInfo());
auto *ItinData = SubTarget.getInstrItineraryData();
Expand Down Expand Up @@ -283,7 +290,7 @@ bool InterBlockScheduling::latencyConverged(BlockState &BS) const {
<< " not met (" << Distance << ")\n");
DEBUG_LOOPAWARE(dbgs() << " " << Succ->NodeNum << ": "
<< *Succ->getInstr());
return false;
return Pred->getInstr();
}
}
}
Expand All @@ -296,7 +303,7 @@ bool InterBlockScheduling::latencyConverged(BlockState &BS) const {
// upperbound of the latency safety margin that should be provided by
// the epilogue
BS.FixPoint.MaxLatencyExtent = MaxExtent;
return true;
return nullptr;
}

bool InterBlockScheduling::updateFixPoint(BlockState &BS) {
Expand All @@ -316,11 +323,20 @@ bool InterBlockScheduling::updateFixPoint(BlockState &BS) {
// Iterate on CurMBB
return false;
}
if (!latencyConverged(BS)) {
BS.FixPoint.LatencyMargin++;

if (MachineInstr *MINeedsHigherCap = latencyConverged(BS)) {
auto Res = BS.FixPoint.PerMILatencyMargin.try_emplace(MINeedsHigherCap, 0);
// Increase the latency margin per instruction, unless we already iterated
// more than MaxExpensiveIterations without converging.
if (BS.FixPoint.NumIters <= MaxExpensiveIterations) {
++Res.first->second;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps assert that we don't exceed MaxLatency (or ConflictHorizon).

} else {
BS.FixPoint.LatencyMargin++;
}
DEBUG_LOOPAWARE(dbgs() << " not converged: latency RM="
<< BS.FixPoint.ResourceMargin << " LM=>"
<< BS.FixPoint.LatencyMargin << "\n");
<< BS.FixPoint.ResourceMargin
<< " LM=" << BS.FixPoint.LatencyMargin
<< " MIM=" << Res.first->second << "\n");
// Iterate on CurMBB
return false;
}
Expand All @@ -341,13 +357,18 @@ bool InterBlockScheduling::successorsAreScheduled(
});
}

std::optional<int>
InterBlockScheduling::getLatencyCap(MachineBasicBlock *BB) const {
auto &BS = getBlockState(BB);
std::optional<int> InterBlockScheduling::getLatencyCap(MachineInstr &MI) const {
auto &BS = getBlockState(MI.getParent());
if (BS.Kind != BlockType::Loop) {
return {};
}
return BS.FixPoint.LatencyMargin;
if (BS.FixPoint.LatencyMargin)
return BS.FixPoint.LatencyMargin;
if (const auto *It = BS.FixPoint.PerMILatencyMargin.find(&MI);
It != BS.FixPoint.PerMILatencyMargin.end()) {
return It->second;
}
return 0;
}

std::optional<int>
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class FixedpointState {
public:
bool IsScheduled = false;
int LatencyMargin = 0;
SmallMapVector<MachineInstr *, int, 8> PerMILatencyMargin;
int ResourceMargin = 0;
// Results from the convergence test
int MaxLatencyExtent = 0;
Expand Down Expand Up @@ -245,7 +246,10 @@ class InterBlockScheduling {

/// The two components of the convergence test
bool resourcesConverged(BlockState &BS) const;
bool latencyConverged(BlockState &BS) const;

/// Return one instruction that needs a higher latency cap, or nullptr if all
/// latencies converged.
MachineInstr *latencyConverged(BlockState &BS) const;

/// After finding the loops, determine the epilogue blocks
void markEpilogueBlocks();
Expand Down Expand Up @@ -301,7 +305,7 @@ class InterBlockScheduling {
/// Return the maximum interblock latency we need to account for
/// for the given successor. This represents the latency margin we assume for
/// an unscheduled successor.
std::optional<int> getLatencyCap(MachineBasicBlock *BB) const;
std::optional<int> getLatencyCap(MachineInstr &MI) const;

/// Return the maximum number of cycles to block for the given successor.
/// This represents the resource usage we assume for an unscheduled successor.
Expand Down
54 changes: 41 additions & 13 deletions llvm/lib/Target/AIE/AIEMachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ static cl::opt<bool>
cl::desc("Track reg pressure more accurately and "
"delay some instructions to avoid spills."));
static cl::opt<unsigned> NumCriticalFreeRegs(
"aie-premisched-near-critical-regs", cl::init(4),
"aie-premisched-near-critical-regs", cl::init(2),
Copy link
Collaborator Author

@gbossu gbossu Aug 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: I'm reducing the limit here, but it then gets multiplied by the number of pressure units required by the reg class. E.g. the number of free units we try to maintain for W is 2, for X it is 4, and for Y it is 8.

cl::desc("Number of free registers below which premisched should actively "
"try to reduce the pressure."));

Expand Down Expand Up @@ -761,6 +761,33 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}

void AIEPreRASchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);

// Cache the threshold for each pressure set.
const std::vector<unsigned> &RegionMaxPressure =
static_cast<ScheduleDAGMILive *>(DAG)->getRegPressure().MaxSetPressure;
PSetThresholds.clear();
for (unsigned PSet = 0, EndPSet = RegionMaxPressure.size(); PSet < EndPSet;
++PSet) {
unsigned MaxPressure = RegionMaxPressure[PSet];
Copy link
Collaborator

@andcarminati andcarminati Aug 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: const unsigned MaxPressure

unsigned Limit = Context->RegClassInfo->getRegPressureSetLimit(PSet);

// If the region has a maximum pressure that exceeds the target threshold,
// artificially reduce that threshold to force more conservative scheduling.
if (MaxPressure > Limit) {
unsigned ExtraPressure = MaxPressure - Limit;
if (Limit > ExtraPressure)
Limit -= ExtraPressure;
else
Limit = 0;
LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(PSet)
<< " Decreased Threshold to " << Limit << "\n");
}
PSetThresholds.push_back(Limit);
}
}

void AIEPreRASchedStrategy::enterRegion(MachineBasicBlock *BB,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
Expand Down Expand Up @@ -874,8 +901,9 @@ bool AIEPreRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone,
}

unsigned CurrPressure = BotRPT.getRegSetPressureAtPos()[WorstPC.getPSet()];
if (CurrPressure + WorstPC.getUnitInc() <
TRI->getRegPressureSetLimit(*CurMBB->getParent(), WorstPC.getPSet())) {
if (CurrPressure + WorstPC.getUnitInc() +
(NumCriticalFreeRegs * WorstPC.getUnitInc()) <
PSetThresholds[WorstPC.getPSet()]) {
// Worsening pressure, but still within limits, keep node as available
return true;
}
Expand Down Expand Up @@ -960,10 +988,11 @@ bool AIEPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
if (!PC.isValid())
return false;
unsigned CurrPressure = BotRPT.getRegSetPressureAtPos()[PC.getPSet()];
unsigned Threshold =
TRI->getRegPressureSetLimit(*CurMBB->getParent(), PC.getPSet());
return Threshold <= NumCriticalFreeRegs ||
CurrPressure >= Threshold - NumCriticalFreeRegs;
unsigned Threshold = PSetThresholds[PC.getPSet()];
unsigned NumCriticalFreeUnits =
NumCriticalFreeRegs * std::abs(PC.getUnitInc());
return Threshold <= NumCriticalFreeUnits ||
CurrPressure >= Threshold - NumCriticalFreeUnits;
};
PressureChange TryCandPC =
getPressureChange(estimatePressureDiff(*TryCand.SU, BotRPT));
Expand All @@ -972,13 +1001,12 @@ bool AIEPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
if ((IsNearCritical(TryCandPC) || IsNearCritical(CandPC)) &&
tryPressure(TryCandPC, CandPC, TryCand, Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
}

// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max pressure of the entire region.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK: isTrackingPressure() is trivially true here.

if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax,
TryCand, Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
}

// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AIE/AIEMachineScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ class AIEPreRASchedStrategy : public GenericScheduler {
public:
AIEPreRASchedStrategy(const MachineSchedContext *C) : GenericScheduler(C) {}

void initialize(ScheduleDAGMI *DAG) override;

void enterRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End, unsigned RegionInstrs);
void leaveRegion(const SUnit &ExitSU);
Expand Down Expand Up @@ -182,6 +184,8 @@ class AIEPreRASchedStrategy : public GenericScheduler {
/// pressure-reducing SU to be scheduled first.
/// SUDelayerMap[0] = 2 means that SU(0) is waiting on SU(2).
std::vector<unsigned> SUDelayerMap;

std::vector<unsigned> PSetThresholds;
};

/// An extension to ScheduleDAGMI that provides callbacks on region entry/exit
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) {
// scheduling a loop.
const AIE::InterBlockScheduling &IB = Scheduler->getInterBlock();
if (!InterBlock) {
if (auto Cap = IB.getLatencyCap(CurBB)) {
if (auto Cap = IB.getLatencyCap(MI)) {
LLVM_DEBUG(dbgs() << "Capped at " << *Cap << "\n");
Latency = std::min(Latency, *Cap);
}
Expand Down
31 changes: 15 additions & 16 deletions llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,34 +150,33 @@ define void @test_huge_stack(i32 noundef %n) #0 {
; CHECK-NEXT: mov p2, p7
; CHECK-NEXT: mov p6, p7
; CHECK-NEXT: paddb [p0], m0
; CHECK-NEXT: paddb [p2], #-32
; CHECK-NEXT: paddb [p6], #-32
; CHECK-NEXT: movxm m0, #-40032
; CHECK-NEXT: st r0, [p0, #0]
; CHECK-NEXT: lda r0, [p0, #0]
; CHECK-NEXT: mov r16, p2
; CHECK-NEXT: paddb [p2], m0
; CHECK-NEXT: mov p0, sp
; CHECK-NEXT: st p0, [p2, #0]
; CHECK-NEXT: mov p0, p1
; CHECK-NEXT: mov p2, p7
; CHECK-NEXT: paddb [p2], #-24
; CHECK-NEXT: mov r16, p2
; CHECK-NEXT: st p0, [p6, #0]
; CHECK-NEXT: mov p0, p7
; CHECK-NEXT: paddb [p0], #-24
; CHECK-NEXT: lshl r2, r0, r2
; CHECK-NEXT: st r0, [p2], #4
; CHECK-NEXT: st r0, [p0], #4
; CHECK-NEXT: add r2, r2, #31
; CHECK-NEXT: st r1, [p2, #0]
; CHECK-NEXT: and r2, r2, r3
; CHECK-NEXT: st r1, [p0, #0]
; CHECK-NEXT: jl #extern_call
; CHECK-NEXT: mov m0, r2 // Delay Slot 5
; CHECK-NEXT: paddb [p1], m0 // Delay Slot 4
; CHECK-NEXT: movxm m0, #-40032 // Delay Slot 3
; CHECK-NEXT: paddb [p6], m0 // Delay Slot 2
; CHECK-NEXT: mov p0, p1 // Delay Slot 5
; CHECK-NEXT: and r2, r2, r3 // Delay Slot 4
; CHECK-NEXT: mov m0, r2 // Delay Slot 3
; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2
; CHECK-NEXT: mov sp, p1 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv
; CHECK-NEXT: nopa ; nopx // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: mov p0, p6 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p0, r16; nopv
; CHECK-NEXT: lda p0, [p0, #0]; nopx
; CHECK-NEXT: mov p0, r16 // Delay Slot 1
; CHECK-NEXT: lda p0, [p6, #0]; nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down
31 changes: 15 additions & 16 deletions llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,34 +150,33 @@ define void @test_huge_stack(i32 noundef %n) #0 {
; CHECK-NEXT: mov p2, p7
; CHECK-NEXT: mov p6, p7
; CHECK-NEXT: paddb [p0], m0
; CHECK-NEXT: paddb [p2], #-32
; CHECK-NEXT: paddb [p6], #-32
; CHECK-NEXT: movxm m0, #-40032
; CHECK-NEXT: st r0, [p0, #0]
; CHECK-NEXT: lda r0, [p0, #0]
; CHECK-NEXT: mov r16, p2
; CHECK-NEXT: paddb [p2], m0
; CHECK-NEXT: mov p0, sp
; CHECK-NEXT: st p0, [p2, #0]
; CHECK-NEXT: mov p0, p1
; CHECK-NEXT: mov p2, p7
; CHECK-NEXT: paddb [p2], #-24
; CHECK-NEXT: mov r16, p2
; CHECK-NEXT: st p0, [p6, #0]
; CHECK-NEXT: mov p0, p7
; CHECK-NEXT: paddb [p0], #-24
; CHECK-NEXT: lshl r2, r0, r2
; CHECK-NEXT: st r0, [p2], #4
; CHECK-NEXT: st r0, [p0], #4
; CHECK-NEXT: add r2, r2, #31
; CHECK-NEXT: st r1, [p2, #0]
; CHECK-NEXT: and r2, r2, r3
; CHECK-NEXT: st r1, [p0, #0]
; CHECK-NEXT: jl #extern_call
; CHECK-NEXT: mov m0, r2 // Delay Slot 5
; CHECK-NEXT: paddb [p1], m0 // Delay Slot 4
; CHECK-NEXT: movxm m0, #-40032 // Delay Slot 3
; CHECK-NEXT: paddb [p6], m0 // Delay Slot 2
; CHECK-NEXT: mov p0, p1 // Delay Slot 5
; CHECK-NEXT: and r2, r2, r3 // Delay Slot 4
; CHECK-NEXT: mov m0, r2 // Delay Slot 3
; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2
; CHECK-NEXT: mov sp, p1 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv
; CHECK-NEXT: nopa ; nopx // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: mov p0, p6 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p0, r16; nopv
; CHECK-NEXT: lda p0, [p0, #0]; nopx
; CHECK-NEXT: mov p0, r16 // Delay Slot 1
; CHECK-NEXT: lda p0, [p6, #0]; nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down
Loading