diff --git a/llvm/lib/Target/AIE/AIE2Subtarget.h b/llvm/lib/Target/AIE/AIE2Subtarget.h index b614e44f34ac..34a5857108b1 100644 --- a/llvm/lib/Target/AIE/AIE2Subtarget.h +++ b/llvm/lib/Target/AIE/AIE2Subtarget.h @@ -59,6 +59,9 @@ class AIE2Subtarget : public AIE2GenSubtargetInfo, public AIEBaseSubtarget { StringRef FS, StringRef ABIName, const TargetMachine &TM); bool enableMachineScheduler() const override { return true; } + bool enableMachinePipeliner() const override { + return AIEBaseSubtarget::enableMachinePipeliner(); + } bool enablePostRAScheduler() const override { return true; } bool enablePostRAMachineScheduler() const override { return true; } bool forcePostRAScheduling() const override { return true; } diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index 84ee36401a04..9bd2d935a136 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -58,6 +58,11 @@ static cl::opt WAWStickyRegistersMemOpsThreshold( cl::desc("Number of memory instructions to enable the register exclusion " "heuristic in WAW sticky registers dep. removal")); +static cl::opt ForcePostPipeliner( + "aie-force-postpipeliner", + cl::desc( + "Force using AIE's post-pipeliner instead of the MachinePipeliner"), + cl::init(false), cl::Hidden); // These are debugging/testing options. // aie-latency-margin defines the latency that will be given to ExitSU edges. @@ -848,3 +853,7 @@ AIEBaseSubtarget::getSMSMutationsImpl(const Triple &TT) { } return Mutations; } + +bool AIEBaseSubtarget::enableMachinePipeliner() const { + return !ForcePostPipeliner; +} diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h index e6e6198138aa..5bd004bc1e43 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h @@ -85,6 +85,10 @@ class AIEBaseSubtarget { /// Required DAG mutations during software pipelining. 
static std::vector> getSMSMutationsImpl(const Triple &TT); + + /// Whether to enable the pre-RA MachinePipeliner. This can be disabled to let + /// the post-RA pipeliner handle the scheduling. + bool enableMachinePipeliner() const; }; } // namespace llvm diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 2be0a2060d82..4491fed02709 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -1079,9 +1079,13 @@ MachineBasicBlock *AIEPreRASchedStrategy::nextBlock() { // The prescheduler also clutters the view of the postpipeliner, so we skip // such blocks here. auto Skip = [](MachineBasicBlock *Block) { - return PreSchedFollowsSkipPipeliner && Block && - AIELoopUtils::isSingleMBBLoop(Block) && - AIELoopUtils::getPipelinerDisabled(*Block); + if (!Block) + return false; + bool PrePipelinerDisabled = + AIELoopUtils::getPipelinerDisabled(*Block) || + !Block->getParent()->getSubtarget().enableMachinePipeliner(); + return PreSchedFollowsSkipPipeliner && + AIELoopUtils::isSingleMBBLoop(Block) && PrePipelinerDisabled; }; do { diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index f7c04c551d58..43955e01e38c 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // This file contains a simple post-RA pipeliner. 
It tries to wrap the linear @@ -17,6 +17,8 @@ #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/MathExtras.h" +#include +#include #define DEBUG_TYPE "postpipeliner" #define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X) @@ -28,6 +30,10 @@ static cl::opt Heuristic("aie-postpipeliner-heuristic", cl::desc("Select one specific post-pipeliner heuristic"), cl::init(-1), cl::Hidden); +static cl::opt + HeuristicRuns("aie-postpipeliner-heuristic-runs", + cl::desc("Number of runs for heuristics that converge"), + cl::init(20), cl::Hidden); PipelineScheduleVisitor::~PipelineScheduleVisitor() {} @@ -134,9 +140,11 @@ int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) { // This assigns Cycle of SU, Earliest of its successors and Latest of its // predecessors void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { - LLVM_DEBUG(dbgs() << "PostPipeline " << SU.NodeNum << " in cycle " << Cycle - << ". "); + LLVM_DEBUG(dbgs() << "PostPipelined SU" << SU.NodeNum << " in cycle " << Cycle + << ": " << *SU.getInstr()); Info[SU.NodeNum].Cycle = Cycle; + + LLVM_DEBUG(dbgs() << " Pushed succs Earliest: "); for (auto &Dep : SU.Succs) { int Latency = Dep.getSignedLatency(); auto *Succ = Dep.getSUnit(); @@ -146,12 +154,14 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { const int SNum = Succ->NodeNum; const int NewEarliest = Cycle + Latency; if (NewEarliest > Info[SNum].Earliest) { + LLVM_DEBUG(dbgs() << "SU" << SNum << " from " << Info[SNum].Earliest + << " to " << NewEarliest << " ; "); Info[SNum].LastEarliestPusher = SU.NodeNum; Info[SNum].Earliest = NewEarliest; Info[SU.NodeNum].NumPushedEarliest++; - LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << " -; "); } } + LLVM_DEBUG(dbgs() << "\n Pushed preds Latest: "); for (auto &Dep : SU.Preds) { int Latency = Dep.getSignedLatency(); auto *Pred = Dep.getSUnit(); @@ -161,10 +171,11 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { 
const int PNum = Pred->NodeNum; const int NewLatest = Cycle - Latency; if (NewLatest < Info[PNum].Latest) { + LLVM_DEBUG(dbgs() << "SU" << PNum << " from " << Info[PNum].Latest + << " to " << NewLatest << " ; "); Info[PNum].LastLatestPusher = SU.NodeNum; Info[PNum].Latest = NewLatest; Info[SU.NodeNum].NumPushedLatest++; - LLVM_DEBUG(dbgs() << PNum << " to - " << Info[PNum].Latest << "; "); } } LLVM_DEBUG(dbgs() << "\n"); @@ -194,24 +205,66 @@ int PostPipeliner::fit(MachineInstr *MI, int First, int Last, int II) { return -1; } +// Account for predecessor that require the same resources by pushing Earliest +// further. +void PostPipeliner::biasForLocalResourceContention(NodeInfo &NI, + const SUnit &SU) { + SlotCounts Slots(NI.Slots); + int PredEarliest = std::numeric_limits::max(); + SmallSet UniqueAncestors; + int Count = 0; + + for (const SDep &Dep : SU.Preds) { + if (Dep.getKind() != SDep::Data) { + continue; + } + int P = Dep.getSUnit()->NodeNum; + const NodeInfo &Pred = Info[P]; + auto [It, Inserted] = UniqueAncestors.insert(P); + if (Inserted) { + Slots += Pred.Slots; + Count++; + } + PredEarliest = std::min(PredEarliest, Pred.Earliest); + } + + // When we need more slots than we have data predecessors, we have local + // resource contention that we can safely account for in Earliest. + if (Count > 0 && Slots.max() > Count) { + int NewEarliest = std::max(NI.Earliest, PredEarliest + Slots.max() - 1); + LLVM_DEBUG(dbgs() << " SU" << SU.NodeNum << " MaxSlots=" << Slots.max() + << ": Earliest " << NI.Earliest << " -> " << NewEarliest + << "\n"); + NI.Earliest = NewEarliest; + } +} + void PostPipeliner::computeForward() { // The forward order defines a topological sort, so we can compute // Earliest and Ancestors in a single forward sweep for (int K = 0; K < NInstr; K++) { + LLVM_DEBUG(dbgs() << "computeForward SU" << K << "\n"); auto &Me = Info[K]; SUnit &SU = DAG->SUnits[K]; + + // Give a more realistic Earliest if preds require similar resources. 
+ biasForLocalResourceContention(Me, SU); + + // Accumulate all data predecessors. for (auto &Dep : SU.Preds) { if (Dep.getKind() != SDep::Data) { continue; } int P = Dep.getSUnit()->NodeNum; assert(P < K); + const NodeInfo &Pred = Info[P]; Me.Ancestors.insert(P); - auto &Pred = Info[P]; for (int Anc : Pred.Ancestors) { Me.Ancestors.insert(Anc); } } + + // Propagate Earliest to successors for (auto &Dep : SU.Succs) { auto *Succ = Dep.getSUnit(); if (Succ->isBoundaryNode()) { @@ -219,9 +272,13 @@ void PostPipeliner::computeForward() { } auto &SInfo = Info[Succ->NodeNum]; const int NewEarliest = Me.Earliest + Dep.getSignedLatency(); + if (NewEarliest != SInfo.Earliest) { + LLVM_DEBUG(dbgs() << " SU" << Succ->NodeNum << " : Earliest " + << SInfo.Earliest << " -> " + << std::max(SInfo.Earliest, NewEarliest) << "\n"); + } SInfo.Earliest = std::max(SInfo.Earliest, NewEarliest); } - Me.Slots = getSlotCounts(*SU.getInstr(), TII); } } @@ -261,6 +318,11 @@ bool PostPipeliner::computeBackward() { bool PostPipeliner::computeLoopCarriedParameters() { + // Initialize slot counts. + for (int K = 0; K < NTotalInstrs; K++) { + Info[K].Slots = getSlotCounts(*DAG->SUnits[K].getInstr(), TII); + } + // Forward properties like Earliest and Ancestors. 
computeForward(); @@ -300,6 +362,9 @@ bool PostPipeliner::computeLoopCarriedParameters() { const int KNextIter = K + NInstr; const int Earliest = Info[KNextIter].Earliest - II; Info[K].Earliest = std::max(Info[K].Earliest, Earliest); + LLVM_DEBUG(dbgs() << "SU" << K << " LCD: Earliest=" << Info[K].Earliest + << "(Modulo SU" << KNextIter + << " Earliest=" << Info[KNextIter].Earliest << ")\n"); } // Make Earliest of the second iteration push up Latest of the first @@ -323,10 +388,17 @@ bool PostPipeliner::computeLoopCarriedParameters() { } // Save the static values for ease of reset - for (auto &N : Info) { + for (auto &N : Info.Nodes) { N.StaticEarliest = N.Earliest; N.StaticLatest = N.Latest; } + + LLVM_DEBUG(dbgs() << "Final Earliest - Latest:\n"); + for (int K = 0; K < NTotalInstrs; K++) { + auto &Me = Info[K]; + LLVM_DEBUG(dbgs() << " SU" << K << " : " << Me.Earliest << " - " + << Me.Latest << "\n"); + } return true; } @@ -343,24 +415,23 @@ int PostPipeliner::computeMinScheduleLength() const { return MinLength; } -void dumpGraph(int NInstr, const std::vector &Info, - ScheduleDAGInstrs *DAG) { +void dumpGraph(const ScheduleInfo &Info, ScheduleDAGInstrs *DAG) { dbgs() << "digraph {\n"; - for (int K = 0; K < NInstr; K++) { + for (int K = 0; K < Info.NInstr; K++) { auto &SU = DAG->SUnits[K]; for (auto &Dep : SU.Succs) { auto *Succ = Dep.getSUnit(); int S = Succ->NodeNum; - if (S % NInstr == K) { + if (S % Info.NInstr == K) { continue; } dbgs() << "\tSU" << K << " -> " << "SU" << S; - if (S >= NInstr) { - dbgs() << "_" << S % NInstr; + if (S >= Info.NInstr) { + dbgs() << "_" << S % Info.NInstr; } if (Dep.getKind() == SDep::Data) { dbgs() << " [color=red] "; @@ -380,6 +451,25 @@ void dumpGraph(int NInstr, const std::vector &Info, dbgs() << "}\n"; } +void dumpIntervals(const ScheduleInfo &Info, int MinLength) { + dbgs() << "Intervals:\n"; + for (int K = 0; K < Info.NInstr; K++) { + std::string Head = "SU" + std::to_string(K); + dbgs() << Head; + for (int I = 
Head.length() - 6; I < MinLength; I++) { + if (I == 0) { + dbgs() << "|"; + } + if (I >= Info[K].Earliest && I <= MinLength + Info[K].Latest) { + dbgs() << "*"; + } else { + dbgs() << " "; + } + } + dbgs() << "\n"; + } +} + int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) { assert(FirstUnscheduled <= LastUnscheduled); while (Info[FirstUnscheduled].Scheduled) { @@ -425,8 +515,8 @@ void PostPipeliner::resetSchedule(bool FullReset) { auto &N = Info[K]; N.reset(FullReset); if (K < NInstr) { - N.Earliest = N.StaticEarliest; - N.Latest = N.StaticLatest; + N.Earliest = N.TweakedEarliest ? *N.TweakedEarliest : N.StaticEarliest; + N.Latest = N.TweakedLatest ? *N.TweakedLatest : N.StaticLatest; } } @@ -447,8 +537,12 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { const int Actual = Strategy.fromTop() ? fit(MI, Earliest, Latest + 1, II) : fit(MI, Latest, Earliest - 1, II); if (Actual < 0) { - // out of resources for this II; LLVM_DEBUG(dbgs() << "Out of resources\n"); + + // The node might have been given too tight Earliest/Latest attributes. + // Relax those to give another chance for scheduling this II. 
+ Info[N].TweakedEarliest = {}; + Info[N].TweakedLatest = {}; return false; } Strategy.selected(SU); @@ -458,6 +552,7 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { LLVM_DEBUG(dbgs() << " Emit in " << Cycle << "\n"); for (int N = 0; N < NCopies; N++) { if (N > 0 && HR.checkConflict(Scoreboard, *MI, Cycle)) { + LLVM_DEBUG(dbgs() << "Conflict in iteration N=" << N << "\n"); return false; } @@ -476,16 +571,16 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { } namespace { -void dumpEarliestChain(const std::vector &Info, int N) { +void dumpEarliestChain(const ScheduleInfo &Info, int N) { auto Prev = Info[N].LastEarliestPusher; if (Prev) { dumpEarliestChain(Info, *Prev); } - dbgs() << " --> " << N << " @" << Info[N].Cycle << "\n"; + dbgs() << " --> SU" << N << " @" << Info[N].Cycle << "\n"; } } // namespace -bool PostPipeliner::scheduleOtherIterations() { +bool PostPipeliner::scheduleOtherIterations(PostPipelinerStrategy &Strategy) { // Make sure that all the copies can be placed at II from the previous one. // This looks like overkill, but it accommodates dependences that span // multiple loop edges. Without these, the pattern should repeat after the @@ -494,16 +589,43 @@ bool PostPipeliner::scheduleOtherIterations() { for (int K = 0; K < NInstr; K++) { const int N = L + K; SUnit &SU = DAG->SUnits[N]; + NodeInfo &Node = Info[N]; + const SUnit &ModuloSU = DAG->SUnits[N - NInstr]; + NodeInfo &ModuloNode = Info[N - NInstr]; + // Earliest tracks the latencies of the loop carried deps - const int Earliest = Info[N].Earliest; + const int Earliest = Node.Earliest; // Insert supplies the modulo condition. 
- const int Insert = Info[N - NInstr].Cycle + II; + const int Insert = ModuloNode.Cycle + II; // All iterations following the first one should fit exactly if (Earliest > Insert) { - LLVM_DEBUG(dbgs() << " Latency not met for " << N - << "(Earliest=" << Earliest << ")\n"; - dumpEarliestChain(Info, N);); + LLVM_DEBUG(dbgs() << "Latency not met for SU" << N << " in cycle " + << Insert << " (Earliest=" << Earliest + << " ModuloNode=SU" << N - NInstr << ")\n"; + dumpEarliestChain(Info, N)); + if (Strategy.mobility(ModuloSU) > 0) { + // The modulo Node can be delayed + ModuloNode.TweakedEarliest = ModuloNode.Earliest + 1; + LLVM_DEBUG(dbgs() << " Try to delay SU" << N - NInstr + << " with TweakedEarliest= " + << ModuloNode.TweakedEarliest << "\n"); + return false; + } + if (Node.LastEarliestPusher && *Node.LastEarliestPusher < NInstr) { + // The modulo Node cannot be delayed. + // Instead, prioritise whatever pushed us. + NodeInfo &Pusher = Info[*Node.LastEarliestPusher]; + if (Strategy.mobility(DAG->SUnits[*Node.LastEarliestPusher]) > 0) { + ModuloNode.TweakedEarliest = {}; + Pusher.TweakedLatest = Pusher.Latest - 1; + LLVM_DEBUG(dbgs() + << " Try to prioritise SU" << *Node.LastEarliestPusher + << " with TweakedLatest= " << Pusher.TweakedLatest + << "\n"); + return false; + } + } return false; } @@ -513,10 +635,19 @@ bool PostPipeliner::scheduleOtherIterations() { return true; } +static int getMinOutputLat(ArrayRef<SDep> Edges) { + int Min = std::numeric_limits<int>::max(); + for (const SDep &Dep : Edges) { + if (Dep.getKind() != SDep::Output) + continue; + Min = std::min(Min, Dep.getSignedLatency()); + } + return Min; // INT_MAX if Edges holds no output dependence. +} + class DefaultStrategy : public PostPipelinerStrategy { public: - DefaultStrategy(ScheduleDAGMI &DAG, std::vector &Info, - int LatestBias) + DefaultStrategy(ScheduleDAGMI &DAG, ScheduleInfo &Info, int LatestBias) : PostPipelinerStrategy(DAG, Info, LatestBias) {} bool better(const SUnit &A, const SUnit &B) override { return Info[A.NodeNum].Latest < 
Info[B.NodeNum].Latest; @@ -525,6 +656,7 @@ class DefaultStrategy : public PostPipelinerStrategy { class ConfigStrategy : public PostPipelinerStrategy { bool TopDown = true; + bool Alternate = false; public: enum PriorityComponent { @@ -533,6 +665,8 @@ class ConfigStrategy : public PostPipelinerStrategy { Critical, Sibling, LCDLatest, + DepLength, // Schedule "as deep as possible" first + Liveness, // Minimise liveness by looking at output deps Size }; static std::string getPriorityName(PriorityComponent Component) { @@ -547,11 +681,22 @@ class ConfigStrategy : public PostPipelinerStrategy { return "Sibling"; case PriorityComponent::LCDLatest: return "LcdLatest"; + case PriorityComponent::DepLength: + return "DepLength"; + case PriorityComponent::Liveness: + return "Liveness"; default: break; } return "Size - Illegal"; } + struct Configuration { + int ExtraStages = 0; + bool TopDown = true; + bool Alternate = false; + int Runs = 0; + SmallVector Components; + }; private: std::string Name; @@ -583,6 +728,14 @@ class ConfigStrategy : public PostPipelinerStrategy { auto &IB = Info[B.NodeNum]; return IA.LCDLatest < IB.LCDLatest; }, + [&](const SUnit &A, const SUnit &B) { + return A.getDepth() > B.getDepth(); + }, + [&](const SUnit &A, const SUnit &B) { + // This tries to minimise live ranges of registers by favouring + // nodes that have successors with negative latencies. 
+ return getMinOutputLat(A.Succs) < getMinOutputLat(B.Succs); + }, }; std::vector Priority; @@ -628,15 +781,20 @@ class ConfigStrategy : public PostPipelinerStrategy { PredSiblingScheduled.insert(PDep.getSUnit()->NodeNum); } } + if (Alternate) { + TopDown = !TopDown; + } } public: std::string name() override { return Name; } - ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, - int Length, bool TopDown, + ConfigStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info, int Length, + bool TopDown, bool Alternate, ArrayRef Components) - : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown) { - Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown); + : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown), + Alternate(Alternate) { + Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown) + + "_" + std::to_string(Alternate); for (auto Comp : Components) { Name += "_" + getPriorityName(Comp); Priority.emplace_back(Comp); @@ -644,59 +802,53 @@ class ConfigStrategy : public PostPipelinerStrategy { } }; -static const struct { - int ExtraStages; - bool TopDown; - bool Rerun; - ConfigStrategy::PriorityComponent Components[3]; -} Strategies[] = { +using Prio = ConfigStrategy::PriorityComponent; +static const ConfigStrategy::Configuration Strategies[] = { // Loosely speaking, a lower value of the first parameter targets // a lower stage count, which benefits code size. - // Rerurn is only useful for heuristics that use it, e.g. 
Critical - {1, true, false, {ConfigStrategy::NodeNum}}, - {1, true, false, {ConfigStrategy::Latest}}, - {1, true, true, {ConfigStrategy::Critical}}, - {1, true, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, - {0, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, - {1, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, - // This is pure bottom up - {1, false, false, {ConfigStrategy::NodeNum}}, + // Runs>1 is only useful for heuristics that use it, e.g. Critical + // Runs<0 selects -aie-postpipeliner-heuristic-runs, which is only known + // after this statically initialized table has been built. + // {ExtraStages, TopDown, Alternate, Runs, PriorityComponents} + {1, true, false, 1, {Prio::NodeNum}}, + {1, true, false, -1, {Prio::Latest}}, + {1, true, false, -1, {Prio::Critical}}, + {1, true, false, -1, {Prio::Latest, Prio::Sibling}}, + {1, true, false, -1, {Prio::DepLength, Prio::Latest}}, + {1, true, false, -1, {Prio::Critical, Prio::LCDLatest}}, + {1, true, false, -1, {Prio::Liveness, Prio::Latest}}, + {1, true, false, -1, {Prio::Latest, Prio::Liveness}}, + // Bottom-up strategies + {0, false, false, 2, {Prio::Critical, Prio::LCDLatest}}, + {1, false, false, 2, {Prio::Critical, Prio::LCDLatest}}, + {1, false, false, 1, {Prio::NodeNum}}, // pure bottom up }; bool PostPipeliner::tryHeuristics() { int MinLength = computeMinScheduleLength(); - DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); int HeuristicIndex = 0; - for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) { + for (const auto &Config : Strategies) { if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { continue; } - ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, TopDown, - Components); + ConfigStrategy S(*DAG, Info, MinLength + Config.ExtraStages * II, + Config.TopDown, Config.Alternate, Config.Components); resetSchedule(/*FullReset=*/true); - DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n"); - if (scheduleFirstIteration(S) && scheduleOtherIterations()) { - 
DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " found II=" << II + // Negative run counts are resolved against the command line option here. + const int Runs = Config.Runs < 0 ? HeuristicRuns.getValue() : Config.Runs; + for (int Run = 0; Run < Runs; Run++) { + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << " run=" << Run << "\n"); - return true; - } - - DEBUG_SUMMARY(dbgs() << " failed\n"); - if (!Rerun) { - continue; - } - - // Rerun with dynamic information retained - resetSchedule(/*FullReset=*/false); - DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() - << " with critical path"); - if (scheduleFirstIteration(S) && scheduleOtherIterations()) { - DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n"); - return true; + if (scheduleFirstIteration(S) && scheduleOtherIterations(S)) { + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " run=" << Run + << " found II=" << II << "\n"); + return true; + } + resetSchedule(/*FullReset=*/false); } - DEBUG_SUMMARY(dbgs() << " failed\n"); + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " failed\n"); } DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); return false; @@ -718,15 +870,14 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { Depth = NCopies * II + HR.getPipelineDepth(); Scoreboard.reset(Depth); - Info.clear(); - Info.resize(NTotalInstrs); + Info.init(NInstr, NCopies); LLVM_DEBUG(for (int I = 0; I < NInstr; I++) { dbgs() << I << " " << *DAG->SUnits[I].getInstr(); }); - LLVM_DEBUG(dumpGraph(NInstr, Info, DAG)); + LLVM_DEBUG(dumpGraph(Info, DAG)); computeLoopCarriedParameters(); - + LLVM_DEBUG(dumpIntervals(Info, computeMinScheduleLength())); if (!tryHeuristics()) { LLVM_DEBUG(dbgs() << "PostPipeliner: No schedule found\n"); return false; @@ -836,6 +987,8 @@ void NodeInfo::reset(bool FullReset) { Earliest = 0; Latest = -1; if (FullReset) { + TweakedEarliest = {}; + TweakedLatest = {}; NumPushedEarliest = 0; NumPushedLatest = 0; LastEarliestPusher = {}; diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 2ddb8ef79a44..63cb496ffe8d 100644 --- 
a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // This file contains a simple post-RA pipeliner. It tries to wrap the linear @@ -65,6 +65,11 @@ class NodeInfo { int StaticEarliest = 0; int StaticLatest = -1; + /// "Tweaked" numbers for \p Earliest and \p Latest to use for the next + /// iteration of this strategy. + std::optional TweakedEarliest; + std::optional TweakedLatest; + // Slots necessary for this instruction. SlotCounts Slots; @@ -90,14 +95,27 @@ class NodeInfo { void reset(bool FullReset); }; +class ScheduleInfo { +public: + std::vector<NodeInfo> Nodes; // NInstr * NCopies entries after init() + int NInstr = 0; // Original instruction count; 0 until init() + void init(int NOrig, int NCopies) { + NInstr = NOrig; + Nodes.clear(); + Nodes.resize(NInstr * NCopies); + } + NodeInfo &operator[](int N) { return Nodes[N]; } + const NodeInfo &operator[](int N) const { return Nodes[N]; } +}; + class PostPipelinerStrategy { protected: ScheduleDAGInstrs &DAG; - std::vector &Info; + ScheduleInfo &Info; int LatestBias = 0; public: - PostPipelinerStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, + PostPipelinerStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info, int LatestBias) : DAG(DAG), Info(Info), LatestBias(LatestBias) {}; virtual ~PostPipelinerStrategy() {}; @@ -111,6 +129,7 @@ class PostPipelinerStrategy { virtual int latest(const SUnit &N) { return Info[N.NodeNum].Latest + LatestBias; } + virtual int mobility(const SUnit &N) { return latest(N) - earliest(N); } // Select from top or from bottom. virtual bool fromTop() { return true; } // Report a final selection. This marks the start of selecting a new node. 
@@ -139,9 +158,10 @@ class PostPipeliner { int FirstUnscheduled = 0; int LastUnscheduled = -1; - /// Holds the cycle of each SUnit. The following should hold: + /// Holds the scheduling information for each instruction. The following + /// should hold: /// Cycle(N) mod II == Cycle(N % NInstr) mod II - std::vector Info; + ScheduleInfo Info; // The scoreboard and its depth ResourceScoreboard Scoreboard; @@ -179,6 +199,7 @@ class PostPipeliner { bool computeLoopCarriedParameters(); /// Helpers of computeLoopCarriedParameters() + void biasForLocalResourceContention(NodeInfo &NI, const SUnit &SU); void computeForward(); bool computeBackward(); @@ -200,7 +221,7 @@ class PostPipeliner { bool scheduleFirstIteration(PostPipelinerStrategy &Strategy); /// Check that all copied instructions can run in the same modulo cycle - bool scheduleOtherIterations(); + bool scheduleOtherIterations(PostPipelinerStrategy &Strategy); /// Reset dynamic scheduling data. /// If FullReset is set, also reset information collected from earlier diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PSubtarget.h b/llvm/lib/Target/AIE/aie2p/AIE2PSubtarget.h index a6ffbc8603d5..e78903a1032b 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PSubtarget.h +++ b/llvm/lib/Target/AIE/aie2p/AIE2PSubtarget.h @@ -53,6 +53,9 @@ class AIE2PSubtarget : public AIE2PGenSubtargetInfo, public AIEBaseSubtarget { AIE2PSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, StringRef ABIName, const TargetMachine &TM); bool enableMachineScheduler() const override { return true; } + bool enableMachinePipeliner() const override { + return AIEBaseSubtarget::enableMachinePipeliner(); + } bool enablePostRAScheduler() const override { return true; } bool enablePostRAMachineScheduler() const override { return true; } bool forcePostRAScheduling() const override { return true; } diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir 
index 51f7b0931268..ba1c58b94b69 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir @@ -25,68 +25,68 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; mov p7, p5 + ; CHECK-NEXT: vlda wh9, [p4, #416]; nopxm + ; CHECK-NEXT: vlda wh7, [p4, #352] + ; CHECK-NEXT: vlda wl7, [p4, #320] + ; CHECK-NEXT: vlda wl9, [p4, #384] + ; CHECK-NEXT: vlda wh11, [p4, #480] + ; CHECK-NEXT: vlda wl11, [p4, #448]; mov p7, p5 + ; CHECK-NEXT: vldb wh8, [p0, #32]; mov p4, p7 ; CHECK-NEXT: vldb wl8, [p0], m4 ; CHECK-NEXT: vldb wh10, [p0, #32] ; CHECK-NEXT: vldb wl10, [p0], m4 ; CHECK-NEXT: vldb wh1, [p0, #32] - ; CHECK-NEXT: vldb wl1, [p0], m4 - ; CHECK-NEXT: vldb wh3, [p0, #32]; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3 - ; CHECK-NEXT: vlda wl7, [p4, #320]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; vlda wh9, [p4, #416]; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv - ; CHECK-NEXT: nopb ; vlda wl9, [p4, #384]; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv - ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv - ; CHECK-NEXT: nopb ; vlda wl5, [p5], #256; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv - ; CHECK-NEXT: nopb ; vlda wh11, [p4, #480]; nops ; nopx ; vshift.align x6, x6, s0, x3, r3; nopv - ; CHECK-NEXT: nopb ; vlda wl11, [p4, #448]; nops ; nopx ; vshuffle x3, x4, x6, r9; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 - ; CHECK-NEXT: nopa ; vshuffle x1, x3, x5, r13 - ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29 - ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 - ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, 
x5, r29 - ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 - ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 - ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x0, x0, s0, x8, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv + ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv + ; CHECK-NEXT: nopb ; vlda wl5, [p5], #256; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x6, x6, s0, x3, r3; vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; vmac.f bmh3, bmh3, x3, x9, r29 - ; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopx ; vmac.f bmh2, bmh2, x10, x9, r29 - ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x7, r29 - ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 - ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x7, r29 - ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 - ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 - ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 - ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 - ; CHECK-NEXT: vlda wl7, [p4, #320] - ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 - ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 - ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 - ; CHECK-NEXT: vlda wl5, 
[p5], #256; vshuffle x5, x0, x2, r25 - ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 - ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 - ; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 - ; CHECK-NEXT: vshuffle x1, x3, x5, r13 - ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29 - ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 - ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 - ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 - ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshuffle x3, x4, x6, r9; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vlda wl7, [p4, #320]; vshuffle x1, x3, x5, r13; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x3, x3, x5, r24; vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vlda wh11, [p4, #480]; mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vlda wl11, [p4, #448]; and r3, r3, r0; mov p7, p5; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vldb wh8, [p0, #32]; add r3, r3, #34; mov p4, p7; vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vshift.align x0, x0, s0, x8, r3; vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshuffle x8, x0, x2, r9 + 
; CHECK-NEXT: vlda wl5, [p5], #256; vshift.align x4, x4, s0, x1, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x6, x6, s0, x3, r3; vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 - ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x9, r29 - ; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: nopa ; nopx ; vshuffle x3, x4, x6, r9; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vshuffle x1, x3, x5, r13; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: and r3, r3, r0; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bml3, bml3, x1, x7, r29 ; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29 - ; CHECK-NEXT: vmac.f bml5, bml5, x10, x7, r29 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir new file mode 100644 index 000000000000..b00c4711aa3f --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir @@ -0,0 +1,228 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from conv2d_bf16_0 +# Same register allocation as what comes out of the register re-allocator. + +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wh7, [p5, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl7, [p5], #64 + ; CHECK-NEXT: vldb wh8, [p4, #32] + ; CHECK-NEXT: vldb wl8, [p4], m4 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wh10, [p4, #32] + ; CHECK-NEXT: vldb wl10, [p4], m4 + ; CHECK-NEXT: vldb wh1, [p4, #32] + ; CHECK-NEXT: vldb wl1, [p4], m4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh3, [p4, #32]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb.3d wl3, [p4], d0; vshift.align x0, x0, s0, x8, r21 + ; CHECK-NEXT: mov r3, p4 + ; CHECK-NEXT: and r3, r3, r6; add.nc lc, r0, #-1 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r21; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv + ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshift.align x4, x4, s0, x1, r21; nopv + ; CHECK-NEXT: vldb wl5, [p5], #64; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r10; nopv + ; CHECK-NEXT: vldb wh9, [p5, #32]; nopa ; nops ; add r21, r3, 
#34; vshift.align x6, x6, s0, x3, r21; nopv + ; CHECK-NEXT: vldb wl9, [p5], #64; nopa ; nops ; nopx ; vshuffle x3, x4, x6, r10; vmac.f bmh4, bmh4, x8, x7, r14 + ; CHECK-NEXT: vldb wh11, [p5, #32]; nopx ; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vldb wl11, [p5], #64; vshuffle x1, x3, x5, r15 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh7, [p5, #32]; nopx ; vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r14 + ; CHECK-NEXT: vldb wl7, [p5], #64; vmac.f bmh3, bmh3, x1, x7, r14 + ; CHECK-NEXT: vldb wh8, [p4, #32]; vmac.f bml5, bml5, x10, x5, r14 + ; CHECK-NEXT: vldb wl8, [p4], m4; vmac.f bml2, bml2, x8, x5, r14 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x7, r14 + ; CHECK-NEXT: vldb wh10, [p4, #32]; vmac.f bmh2, bmh2, x8, x9, r14 + ; CHECK-NEXT: vldb wl10, [p4], m4; vmac.f bml3, bml3, x10, x11, r14 + ; CHECK-NEXT: vldb wh1, [p4, #32]; vmac.f bmh7, bmh7, x8, x11, r14 + ; CHECK-NEXT: vldb wl1, [p4], m4; vmac.f bmh1, bmh1, x1, x9, r14 + ; CHECK-NEXT: vldb wh3, [p4, #32]; vmac.f bmh6, bmh6, x3, x9, r14 + ; CHECK-NEXT: vldb.3d wl3, [p4], d0; vshift.align x0, x0, s0, x8, r21; vmac.f bmh5, bmh5, x10, x9, r14 + ; CHECK-NEXT: mov r3, p4; vmac.f bmh8, bmh8, x1, x5, r14 + ; CHECK-NEXT: and r3, r3, r6; vmac.f bml6, bml6, x3, x5, r14 + ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r21; vmac.f bmh0, bmh0, x1, x11, r14 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25; vmac.f bml4, bml4, x3, x11, r14 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r21 + ; CHECK-NEXT: vldb wl5, [p5], #64; vshuffle x8, x0, x2, r10 + ; CHECK-NEXT: vldb wh9, [p5, #32]; add r21, r3, #34; vshift.align x6, x6, s0, x3, r21 + ; CHECK-NEXT: vldb wl9, [p5], #64; vshuffle x3, x4, x6, r10; vmac.f bmh4, bmh4, x8, x7, r14 + ; CHECK-NEXT: vldb wh11, [p5, #32]; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb wl11, [p5], #64; nopa ; nops ; nopx ; vshuffle x1, x3, x5, r15; nopv + ; 
CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopx ; vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r14 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x1, x7, r14 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x5, r14 + ; CHECK-NEXT: vmac.f bml2, bml2, x8, x5, r14 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x7, r14 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x8, x9, r14 + ; CHECK-NEXT: vmac.f bml3, bml3, x10, x11, r14 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x8, x11, r14 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x1, x9, r14 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x3, x9, r14 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x10, x9, r14 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x5, r14 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x5, r14 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x11, r14 + ; CHECK-NEXT: vmac.f bml4, bml4, x3, x11, r14 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr 
addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj0, $dj3, $dj4, $dj7, $dn0, $dn4, $dn7, $m0, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $r1, $r2, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $x0, $x2, $x4, $x6, $d0_3d:0x000000000003C870, $dn3, $dn1, $dn5, $dj5 + + renamable $wh8 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl8, renamable $p4 = VLD_pstm_pseudo killed renamable $p4, renamable $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh10 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl10, renamable $p4 = VLD_pstm_pseudo killed renamable $p4, renamable $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh1 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl1, renamable $p4 = VLD_pstm_pseudo killed renamable $p4, renamable $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh3 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl3, $p4, $dc0, $dc4 = VLD_3D_pseudo killed $p4, $d0_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $x0 = VSHIFT_ALIGN killed 
renamable $x0, renamable $s0, killed renamable $x8, renamable $r21 + renamable $x2 = VSHIFT_ALIGN killed renamable $x2, renamable $s0, killed renamable $x10, renamable $r21 + renamable $x4 = VSHIFT_ALIGN killed renamable $x4, renamable $s0, killed renamable $x1, renamable $r21 + renamable $x6 = VSHIFT_ALIGN killed renamable $x6, renamable $s0, killed renamable $x3, killed renamable $r21 + renamable $x8 = VSHUFFLE renamable $x0, renamable $x2, renamable $r10 + renamable $x3 = VSHUFFLE renamable $x4, renamable $x6, renamable $r10 + renamable $x5 = VSHUFFLE renamable $x0, renamable $x2, renamable $r25 + renamable $x10 = VSHUFFLE renamable $x4, renamable $x6, renamable $r25 + renamable $x1 = VSHUFFLE renamable $x3, renamable $x5, renamable $r15 + renamable $x3 = VSHUFFLE killed renamable $x3, killed renamable $x5, renamable $r24 + renamable $wh7 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl7, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh5 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl5, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh9 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl9, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh11 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl11, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $bmh4 = VMAC_F_vmac_bm_core_dense killed renamable $bmh4, renamable $x8, renamable $x7, renamable $r14, implicit-def dead 
$srfpflags, implicit $crfpmask + renamable $bmh3 = VMAC_F_vmac_bm_core_dense killed renamable $bmh3, renamable $x1, renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml1 = VMAC_F_vmac_bm_core_dense killed renamable $bml1, renamable $x3, renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml0 = VMAC_F_vmac_bm_core_dense killed renamable $bml0, renamable $x10, killed renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh2 = VMAC_F_vmac_bm_core_dense killed renamable $bmh2, renamable $x8, renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh1 = VMAC_F_vmac_bm_core_dense killed renamable $bmh1, renamable $x1, renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh6 = VMAC_F_vmac_bm_core_dense killed renamable $bmh6, renamable $x3, renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh5 = VMAC_F_vmac_bm_core_dense killed renamable $bmh5, renamable $x10, killed renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml2 = VMAC_F_vmac_bm_core_dense killed renamable $bml2, renamable $x8, renamable $x5, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh8 = VMAC_F_vmac_bm_core_dense killed renamable $bmh8, renamable $x1, renamable $x5, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml6 = VMAC_F_vmac_bm_core_dense killed renamable $bml6, renamable $x3, renamable $x5, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml5 = VMAC_F_vmac_bm_core_dense killed renamable $bml5, renamable $x10, killed renamable $x5, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh7 = VMAC_F_vmac_bm_core_dense killed renamable $bmh7, killed renamable $x8, renamable $x11, renamable $r14, 
implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh0 = VMAC_F_vmac_bm_core_dense killed renamable $bmh0, killed renamable $x1, renamable $x11, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml4 = VMAC_F_vmac_bm_core_dense killed renamable $bml4, killed renamable $x3, renamable $x11, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml3 = VMAC_F_vmac_bm_core_dense killed renamable $bml3, killed renamable $x10, killed renamable $x11, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + $r3 = MOV_mv_scl $p4 + renamable $r3 = AND killed renamable $r3, renamable $r6 + renamable $r21 = nuw nsw ADD_add_r_ri killed renamable $r3, 34, implicit-def dead $srcarry + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir new file mode 100644 index 000000000000..412ebddb8e52 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir @@ -0,0 +1,223 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from conv2d_bf16_0 +# Registers have been allocated in a way that II=16 is feasible. 
+ +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wh10, [p4, #32]; nopx + ; CHECK-NEXT: vldb wl10, [p4], m4 + ; CHECK-NEXT: vldb wh10, [p5, #32] + ; CHECK-NEXT: vldb wh7, [p4, #32] + ; CHECK-NEXT: vldb wl7, [p4], m4 + ; CHECK-NEXT: vldb wl10, [p5], #64 + ; CHECK-NEXT: vldb wh7, [p5, #32] + ; CHECK-NEXT: vldb wh8, [p4, #32]; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl8, [p4], m4; vshift.align x3, x3, s0, x10, r21 + ; CHECK-NEXT: vldb wh11, [p4, #32]; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wl11, [p4], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl7, [p5], #64; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x7, r21; nopv + ; CHECK-NEXT: vldb wh9, [p5, #32]; nopa ; nops ; nopx ; mov r3, p4; nopv + ; CHECK-NEXT: vldb wl9, [p5], #64; nopa ; nops ; and r3, r3, r6; nopm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x3, x2, r10; nopv + ; CHECK-NEXT: vldb wh4, [p5, #32]; nopa ; nops ; nopx ; vshift.align x1, x1, s0, x8, r21; nopv + ; CHECK-NEXT: vldb wl4, [p5], #64; nopa ; nops ; nopx ; vshuffle x8, x3, x2, r25; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; add r21, r3, #34; vshift.align x0, x0, s0, x11, r21; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh10, [p4, #32]; nopa ; nops ; nopx ; vshuffle x4, x1, x0, r10; vmac.f bmh4, bmh4, x8, x10, r14 + ; CHECK-NEXT: vldb wl10, [p4], m4; nopx ; vshuffle 
x11, x1, x0, r25; vmac.f bmh2, bmh2, x8, x7, r14 + ; CHECK-NEXT: vldb wh10, [p5, #32]; vshuffle x5, x4, x6, r15; vmac.f bml2, bml2, x8, x9, r14 + ; CHECK-NEXT: vldb wh7, [p4, #32]; vshuffle x6, x4, x6, r24; vmac.f bml1, bml1, x11, x10, r14 + ; CHECK-NEXT: vldb wl7, [p4], m4; vmac.f bmh3, bmh3, x5, x10, r14 + ; CHECK-NEXT: vldb wl10, [p5], #64; vmac.f bml0, bml0, x6, x10, r14 + ; CHECK-NEXT: vldb wh7, [p5, #32]; vmac.f bmh1, bmh1, x5, x7, r14 + ; CHECK-NEXT: vldb wh8, [p4, #32]; vmac.f bmh6, bmh6, x11, x7, r14 + ; CHECK-NEXT: vldb wl8, [p4], m4; vshift.align x3, x3, s0, x10, r21; vmac.f bmh5, bmh5, x6, x7, r14 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vmac.f bmh8, bmh8, x5, x9, r14 + ; CHECK-NEXT: vldb.3d wl11, [p4], d0; vmac.f bml6, bml6, x11, x9, r14 + ; CHECK-NEXT: vldb wl7, [p5], #64; vshift.align x2, x2, s0, x7, r21; vmac.f bml5, bml5, x6, x9, r14 + ; CHECK-NEXT: vldb wh9, [p5, #32]; mov r3, p4; vmac.f bmh7, bmh7, x8, x4, r14 + ; CHECK-NEXT: vldb wl9, [p5], #64; and r3, r3, r6; vmac.f bmh0, bmh0, x5, x4, r14 + ; CHECK-NEXT: vshuffle x6, x3, x2, r10; vmac.f bml4, bml4, x11, x4, r14 + ; CHECK-NEXT: vldb wh4, [p5, #32]; vshift.align x1, x1, s0, x8, r21; vmac.f bml3, bml3, x6, x4, r14 + ; CHECK-NEXT: vldb wl4, [p5], #64; vshuffle x8, x3, x2, r25 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; add r21, r3, #34; vshift.align x0, x0, s0, x11, r21; nopv + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopx ; vshuffle x4, x1, x0, r10; vmac.f bmh4, bmh4, x8, x10, r14 + ; CHECK-NEXT: vshuffle x11, x1, x0, r25; vmac.f bmh2, bmh2, x8, x7, r14 + ; CHECK-NEXT: vshuffle x5, x4, x6, r15; vmac.f bml2, bml2, x8, x9, r14 + ; CHECK-NEXT: vshuffle x6, x4, x6, r24; vmac.f bml1, bml1, x11, x10, r14 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x5, x10, r14 + ; CHECK-NEXT: vmac.f bml0, bml0, x6, x10, r14 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x5, x7, r14 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x11, x7, r14 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x6, x7, r14 + ; CHECK-NEXT: 
vmac.f bmh8, bmh8, x5, x9, r14 + ; CHECK-NEXT: vmac.f bml6, bml6, x11, x9, r14 + ; CHECK-NEXT: vmac.f bml5, bml5, x6, x9, r14 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x8, x4, r14 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x5, x4, r14 + ; CHECK-NEXT: vmac.f bml4, bml4, x11, x4, r14 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x4, r14 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, 
inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj0, $dj3, $dj4, $dj7, $dn0, $dn4, $dn7, $m0, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $r1, $r2, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $x0, $x1, $x2, $x3, $d0_3d:0x000000000003C870, $dn3, $dn1, $dn5, $dj5 + + renamable $wh10 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl10, renamable $p4 = VLD_pstm_pseudo killed renamable $p4, renamable $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh7 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl7, renamable $p4 = VLD_pstm_pseudo 
killed renamable $p4, renamable $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh8 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl8, renamable $p4 = VLD_pstm_pseudo killed renamable $p4, renamable $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh11 = VLD_idx_imm_3x32_pseudo renamable $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11, $p4, $dc0, $dc4 = VLD_3D_pseudo killed $p4, $d0_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $x3 = VSHIFT_ALIGN killed renamable $x3, renamable $s0, killed renamable $x10, renamable $r21 + renamable $x2 = VSHIFT_ALIGN killed renamable $x2, renamable $s0, killed renamable $x7, renamable $r21 + renamable $x1 = VSHIFT_ALIGN killed renamable $x1, renamable $s0, killed renamable $x8, renamable $r21 + renamable $x0 = VSHIFT_ALIGN killed renamable $x0, renamable $s0, killed renamable $x11, killed renamable $r21 + renamable $x6 = VSHUFFLE renamable $x3, renamable $x2, renamable $r10 + renamable $x4 = VSHUFFLE renamable $x1, renamable $x0, renamable $r10 + renamable $x8 = VSHUFFLE renamable $x3, renamable $x2, renamable $r25 + renamable $x11 = VSHUFFLE renamable $x1, renamable $x0, renamable $r25 + renamable $x5 = VSHUFFLE renamable $x4, renamable $x6, renamable $r15 + renamable $x6 = VSHUFFLE killed renamable $x4, killed renamable $x6, renamable $r24 + renamable $wh10 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl10, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh7 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl7, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh9 = 
VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl9, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wh4 = VLD_idx_imm_3x32_pseudo renamable $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $wl4, renamable $p5 = VLD_pstm_imm_4x32_pseudo killed renamable $p5, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + renamable $bmh4 = VMAC_F_vmac_bm_core_dense killed renamable $bmh4, renamable $x8, renamable $x10, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh3 = VMAC_F_vmac_bm_core_dense killed renamable $bmh3, renamable $x5, renamable $x10, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml1 = VMAC_F_vmac_bm_core_dense killed renamable $bml1, renamable $x11, renamable $x10, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml0 = VMAC_F_vmac_bm_core_dense killed renamable $bml0, renamable $x6, killed renamable $x10, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh2 = VMAC_F_vmac_bm_core_dense killed renamable $bmh2, renamable $x8, renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh1 = VMAC_F_vmac_bm_core_dense killed renamable $bmh1, renamable $x5, renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh6 = VMAC_F_vmac_bm_core_dense killed renamable $bmh6, renamable $x11, renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh5 = VMAC_F_vmac_bm_core_dense killed renamable $bmh5, renamable $x6, killed renamable $x7, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml2 = VMAC_F_vmac_bm_core_dense killed renamable $bml2, renamable $x8, renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + 
renamable $bmh8 = VMAC_F_vmac_bm_core_dense killed renamable $bmh8, renamable $x5, renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml6 = VMAC_F_vmac_bm_core_dense killed renamable $bml6, renamable $x11, renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml5 = VMAC_F_vmac_bm_core_dense killed renamable $bml5, renamable $x6, killed renamable $x9, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh7 = VMAC_F_vmac_bm_core_dense killed renamable $bmh7, killed renamable $x8, renamable $x4, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bmh0 = VMAC_F_vmac_bm_core_dense killed renamable $bmh0, killed renamable $x5, renamable $x4, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml4 = VMAC_F_vmac_bm_core_dense killed renamable $bml4, killed renamable $x11, renamable $x4, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + renamable $bml3 = VMAC_F_vmac_bm_core_dense killed renamable $bml3, killed renamable $x6, killed renamable $x4, renamable $r14, implicit-def dead $srfpflags, implicit $crfpmask + $r3 = MOV_mv_scl $p4 + renamable $r3 = AND killed renamable $r3, renamable $r6 + renamable $r21 = nuw nsw ADD_add_r_ri killed renamable $r3, 34, implicit-def dead $srcarry + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir index 325109402812..27ef11c95b80 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir @@ -25,66 +25,65 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wh7, [p7, #32]; mov p4, p2 - ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; nopx ; mov p5, p7 + ; CHECK-NEXT: nopa ; vldb wh7, [p7, #32]; nopx ; mov p4, p2; nops ; CHECK-NEXT: vldb wh8, [p0, #32] - ; CHECK-NEXT: vldb wl8, [p0], m4 + ; CHECK-NEXT: vldb wl8, [p0], m4; mov p5, p7 ; CHECK-NEXT: vldb wh10, [p0, #32] ; CHECK-NEXT: vldb wl10, [p0], m4 - ; CHECK-NEXT: vldb wh1, [p0, #32] - ; CHECK-NEXT: vldb wl1, [p0], m4; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320 + ; CHECK-NEXT: vldb wh1, [p0, #32]; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl1, [p0], m4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; nopa ; nops ; nopx ; vshift.align x0, x0, s0, x8, r3; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r1, p0; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; and r2, r1, r0; vshift.align x0, x0, s0, x8, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; and r2, r1, r0; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv - ; CHECK-NEXT: nopb ; vlda wh5, [p2, #352]; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv - ; CHECK-NEXT: vldb wl5, [p4], #64; nopa ; nops ; nopx ; 
vshuffle x8, x0, x2, r9; nopv + ; CHECK-NEXT: nopb ; vlda wh5, [p2, #352]; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: nopa ; vldb wl5, [p4], #64; nopx ; mov p2, p5 ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 - ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x1, x3, x5, r13 ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 - ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wh7, [p7, #32]; nopa ; nops ; nopx ; mov p4, p2; vmac.f bml2, bml2, x3, x7, r29 - ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; nopx ; mov p5, p7; vmac.f bml4, bml4, x8, x5, r29 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh1, bmh1, x8, x9, r29 - ; CHECK-NEXT: vldb wl8, [p0], m4; vmac.f bmh0, bmh0, x1, x9, r29 - ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 - ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 - ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh3, bmh3, x3, x9, r29 - ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh2, bmh2, x10, x9, r29 - ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bml3, bml3, x1, x5, r29 - ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml6, bml6, x3, x5, r29 - ; CHECK-NEXT: mov r1, p0; vmac.f bml5, bml5, x10, x5, r29 - ; CHECK-NEXT: and r2, r1, r0; vshift.align x0, x0, s0, x8, r3; vmac.f bmh4, bmh4, x1, x11, r29 - ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r3; vmac.f bml1, bml1, x3, x11, r29 - ; CHECK-NEXT: vshuffle x5, x0, x2, r25 - ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 - ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x8, x0, x2, r9 + ; 
CHECK-NEXT: vldb wh7, [p7, #32]; nopa ; nops ; nopx ; mov p4, p2; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; mov p5, p7; vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vshift.align x0, x0, s0, x8, r3; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: mov r1, p0; vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: and r2, r1, r0; vshift.align x2, x2, s0, x10, r3; vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: vshuffle x8, x0, x2, r9; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl5, [p4], #64; mov p2, p5 ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 - ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x1, x3, x5, r13 - ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh5, 
bmh5, x1, x7, r29 ; CHECK-NEXT: vmac.f bml2, bml2, x3, x7, r29 ; CHECK-NEXT: vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x5, r29 ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 - ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 - ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x9, r29 ; CHECK-NEXT: vmac.f bml3, bml3, x1, x5, r29 ; CHECK-NEXT: vmac.f bml6, bml6, x3, x5, r29 - ; CHECK-NEXT: vmac.f bml5, bml5, x10, x5, r29 ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir index 0f5200dd62aa..1e752ef0d90a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir @@ -24,70 +24,67 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl9, [p1], m5; nopxm ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wh6, [p0, #96] + ; CHECK-NEXT: vldb wh8, [p0, #32] ; CHECK-NEXT: vldb wl5, [p1], m5 - ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb wh7, [p1], m6 ; CHECK-NEXT: vldb wl8, [p0, #0] - ; CHECK-NEXT: vldb wh8, [p0, #32] - ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 - ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 - ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 - ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl6, [p0, 
#64]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32 + ; CHECK-NEXT: vldb wh10, [p0], #32; add.nc lc, r0, #-1 ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; nopv ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x8, x10, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh1, bmh1, x0, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 - ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x0, x5, r3 - ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 - ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x0, x7, r3 - ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 - ; 
CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 - ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 - ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x0, x3, r3 - ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: nopa ; vldb wh9, [p1], m6; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wl6, [p0, #64]; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml5, bml5, x0, x3, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vshuffle x5, x5, x5, r2; vmac.f bml6, bml6, x10, x3, r3 ; CHECK-NEXT: vldb wl3, [p1], m5 ; CHECK-NEXT: vldb.3d wh3, [p1], d1 ; CHECK-NEXT: nop - ; CHECK-NEXT: vshuffle x1, x8, x10, r4 ; CHECK-NEXT: vshuffle x0, x8, x10, r16 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh1, bmh1, x0, x11, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f 
bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3 ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vmac.f bmh5, bmh5, x0, x5, r3 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 - ; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3 - ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 - ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 ; CHECK-NEXT: vmac.f bml5, bml5, x0, x3, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 - ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir index 626878424dcd..39e4376b0c9c 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir @@ -24,70 +24,67 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl9, [p1], m5 ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wh6, [p0, #96] + ; CHECK-NEXT: vldb wh8, [p0, #32] ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, 
[p1], m6 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb wh7, [p1], m6 ; CHECK-NEXT: vldb wl8, [p0, #0] - ; CHECK-NEXT: vldb wh8, [p0, #32] - ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 - ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 - ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 - ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl6, [p0, #64]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32 + ; CHECK-NEXT: vldb wh10, [p0], #32; add.nc lc, r0, #-1 ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wh7, [p1], m6; movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x2, x10, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x2, x11, r3 - ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, 
r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 - ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x2, x7, r3 - ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 - ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 - ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 - ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x2, x3, r3 - ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: nopa ; vldb wh9, [p1], m6; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wl6, [p0, #64]; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x5, x5, x5, r2; vmac.f bml5, bml5, x2, x3, r3 + ; 
CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bml6, bml6, x0, x3, r3 ; CHECK-NEXT: vldb wl3, [p1], m5 ; CHECK-NEXT: vldb.3d wh3, [p1], d1 ; CHECK-NEXT: nop - ; CHECK-NEXT: vshuffle x1, x8, x10, r4 ; CHECK-NEXT: vshuffle x2, x2, x10, r16 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3 ; CHECK-NEXT: vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3 ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 - ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 - ; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3 - ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 - ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 ; CHECK-NEXT: vmac.f bml5, bml5, x2, x3, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 - ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir 
b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir index f305ff55d071..eda9aa247572 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir @@ -26,46 +26,61 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl8, [p1], m5; nopx + ; CHECK-NEXT: vldb wl8, [p1], m5; nopxm ; CHECK-NEXT: vldb wh8, [p1], m6 - ; CHECK-NEXT: vldb wl9, [p1], m5 ; CHECK-NEXT: vldb wl0, [p0, #0] + ; CHECK-NEXT: vldb wl9, [p1], m5 ; CHECK-NEXT: vldb wh0, [p0, #32] ; CHECK-NEXT: vldb wl1, [p0, #64] - ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; add.nc lc, r0, #-1 - ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh1, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: vldb wh9, [p1], m6; padds [p0], #128 ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6 - ; CHECK-NEXT: vldb wh2, [p0], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; nopv - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; nopv + ; CHECK-NEXT: vldb wh2, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wl10, [p1], m5 + ; CHECK-NEXT: vldb wh10, [p1], m6 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; add.nc lc, r0, #-2 + ; CHECK-NEXT: vldb wl8, [p1], m5; vshuffle x4, x0, x2, r3 + ; CHECK-NEXT: vldb wh8, [p1], m6; vshuffle x5, x0, x2, r16 + ; 
CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; movxm ls, #.LBB0_2; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; movxm le, #.L_LEnd0; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml4, bmh1, x6, x11, r2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wl8, [p1], m5; vshuffle x5, x0, x2, r16 - ; CHECK-NEXT: vldb wh8, [p1], m6; nopx ; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: nopa ; vldb wl8, [p1], m5; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vshuffle x5, x0, x2, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 ; CHECK-NEXT: vldb wl9, [p1], m5; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 
- ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 - ; CHECK-NEXT: vldb wh0, [p0, #32]; vmac.f bmh3, bmh3, x7, x8, r2 - ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vmac.f bmh6, bmh2, x5, x9, r2 - ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 - ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6; vmac.f bmh7, bmh3, x7, x9, r2 - ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh8, bmh0, x4, x10, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml0, bmh1, x6, x10, r2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml1, bmh2, x5, x10, r2 - ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 - ; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml3, bmh0, x4, x11, r2 - ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6; vmac.f bml4, bmh1, x6, x11, r2 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6; vmac.f bml3, bmh0, x4, x11, r2 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; 
vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml4, bmh1, x6, x11, r2 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: nopa ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vshuffle x5, x0, x2, r16; vmac.f bml6, bmh3, x7, x11, r2 ; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir index 0dc14d842dcd..f0c39d42e26d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir @@ -24,76 +24,76 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; vldb wl11, [p1], m5; nopxm ; nops - ; CHECK-NEXT: vldb wh11, [p1], m6 - ; CHECK-NEXT: vldb wl5, [p1], m5 - ; CHECK-NEXT: vlda wh5, [p1], m6 - ; CHECK-NEXT: vldb wl8, [p0], #32 + ; CHECK-NEXT: nopa ; vldb wl8, [p0], #32; nopxm ; nops ; CHECK-NEXT: vldb wh8, [p0], #32 + ; CHECK-NEXT: vldb wl11, [p1], m5 ; CHECK-NEXT: vldb wl1, [p0], #32 ; CHECK-NEXT: vldb wh1, [p0], #32 ; CHECK-NEXT: paddb [p0], m4 + ; CHECK-NEXT: vldb wh11, [p1], m6 ; CHECK-NEXT: vldb wl0, [p0], #32 - ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vlda wh5, [p1], m6 ; CHECK-NEXT: vldb wl3, [p0], #32 ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wl0, [p1], m5 ; CHECK-NEXT: vldb wh0, [p1], m6 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh7, [p1], d1 - ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x2, x8, x0, r16 - ; 
CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x6, x8, x0, r4 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vlda wh5, [p1], m6; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 - ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0], #32; vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 - ; CHECK-NEXT: vldb wh1, [p0], #32; add.nc lc, r0, #-2; vmac.f bml2, bml2, x9, x0, r3 - ; CHECK-NEXT: paddb [p0], m4; movxm ls, #.LBB0_2; vmac.f bmh1, bmh1, x2, x3, r3 - ; CHECK-NEXT: vldb wl0, [p0], #32; movxm le, #.L_LEnd0; vmac.f bmh2, bmh2, x10, x3, r3 - ; CHECK-NEXT: vldb wh0, [p0], #32; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; vmac.f bmh3, bmh3, x9, x3, r3 - ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh8, bmh8, x6, x0, r3 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x2, x0, r3 - ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; vshuffle x2, x8, x0, r16 + ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wh8, [p0], #32 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: paddb [p0], m4; vshuffle x0, x0, x0, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; add.nc lc, r0, #-2; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; movxm ls, #.LBB0_2; vmac.f bmh2, 
bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; movxm le, #.L_LEnd0; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: nopb ; vlda wh5, [p1], m6; nops ; nopxm ; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml2, bml2, x9, x0, r3 ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml3, bml3, x6, x7, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x2, x7, r3 - ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x8, x0, r4; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml6, bml6, x10, x7, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb wl11, [p1], m5; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml4, bml4, x9, x7, r3 - ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x6, x8, x0, r4 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vlda wh5, [p1], m6; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 - ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0], #32; vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 - ; CHECK-NEXT: vldb wh1, [p0], #32; vmac.f bml2, bml2, x9, x0, r3 - ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r3 - ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r3 - ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x5, x5, x5, r2; vmac.f bmh3, bmh3, x9, x3, r3 - ; 
CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r3 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r3 - ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wl8, [p0], #32; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: vldb wh8, [p0], #32 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: paddb [p0], m4; vshuffle x0, x0, x0, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vlda wh5, [p1], m6; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml2, bml2, x9, x0, r3 ; CHECK-NEXT: vldb wh0, [p1], m6; vmac.f bml3, bml3, x6, x7, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x6, x8, x0, r4; vmac.f bml5, bml5, x2, x7, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml6, bml6, x10, x7, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml4, bml4, x9, x7, r3 - ; CHECK-NEXT: vshuffle x6, x8, x0, r4 - ; CHECK-NEXT: vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: nopa ; 
nopb ; nopx ; vshuffle x5, x5, x5, r2; vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: nop ; CHECK-NEXT: vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 - ; CHECK-NEXT: vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 - ; CHECK-NEXT: vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vshuffle x3, x11, x11, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vshuffle x0, x0, x0, r2; vmac.f bmh7, bmh7, x9, x5, r3 ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 - ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r3 ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r3 ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r3 ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r3 ; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r3 ; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r3 ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r3 ; CHECK-NEXT: vmac.f bml5, bml5, x2, x7, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir index 61db962ef75d..c97a7d2b7ad6 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s @@ -29,13 +29,17 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; nopxm + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 ; CHECK-NEXT: nop - ; CHECK-NEXT: add.nc lc, r0, #-4 + ; CHECK-NEXT: add.nc lc, r0, #-5 ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm ls, #.LBB0_2 ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv @@ -54,7 +58,11 @@ ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vst.srs.d8.s32 cm2, s0, [p1], #32; nopx ; vups.s32.s8 cm3, wh2, s1; nopv ; CHECK-NEXT: // %bb.3: // %loop.exit - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; nopb ; nopx ; vups.s32.s8 cm2, wh0, s1; vsrs.s8.s32 wh0, cm0, s1 + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm2, wh0, s1; nopv + ; CHECK-NEXT: nopa ; nopx + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; vst.srs.d8.s32 cm2, s0, [p1], #32; vups.s32.s8 cm3, wh2, s1 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh0, cm0, s1; vups.s32.s8 cm2, wh0, s1 ; CHECK-NEXT: nop ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1 ; CHECK-NEXT: vst.srs.d8.s32 cm2, s0, [p1], #32; vups.s32.s8 cm3, wh2, s1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index b488ec1b22f7..78c033e73a37 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -5,7 +5,7 @@ # # (c) Copyright 2024 Advanced Micro 
Devices, Inc. or its affiliates -# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - --debug-only=postpipeliner-summary | FileCheck %s # Inspired by Round # Currently we don't get this post-pipelined, because the RecMII is diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll new file mode 100644 index 000000000000..614e32a8520f --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll @@ -0,0 +1,192 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +; RUN: llc -mtriple=aie2p --aie-force-postpipeliner %s -o - | FileCheck %s + +; This is a bf16->bfp16 conversion function used by Conv2D kernels. 
+; Ultimately, we should target II=4 + +; Function Attrs: mustprogress noinline +define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noalias %out, ptr nonnull align 64 dereferenceable(64) %params) local_unnamed_addr #0 { +; CHECK-LABEL: convert_bf16_to_bfp16: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda r0, [p2, #0]; nopb ; nops ; nopx ; mov m0, #4; nopv +; CHECK-NEXT: padda [p2], m0; nopb ; nopx +; CHECK-NEXT: lda dn0, [p2], #4 +; CHECK-NEXT: lda m1, [p2], #4 +; CHECK-NEXT: nop +; CHECK-NEXT: mova dj0, #0 +; CHECK-NEXT: movx r24, #0; mov dj1, dj0 +; CHECK-NEXT: mov r26, r24 +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc1, dj0 +; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 +; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1 +; CHECK-NEXT: movxm le, #.L_LEnd0 +; CHECK-NEXT: add.nc lc, r0, #-2 +; CHECK-NEXT: lda m0, [p2, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; mov dc0, dj0; nopv +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; mov p2, p1; nopv +; CHECK-NEXT: // implicit-def: $sf +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nopx ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] +; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26] +; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0] +; CHECK-NEXT: vconv.fp32.bf16 cml0, x0 +; CHECK-NEXT: vconv.fp32.bf16 cmh0, x2 +; CHECK-NEXT: .L_LEnd0: +; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv 
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] +; CHECK-NEXT: vst.flush.512.conv [p2, sf, r26] +; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] +; CHECK-NEXT: vconv.fp32.bf16 cml0, x0 +; CHECK-NEXT: vconv.fp32.bf16 cmh0, x2 +; CHECK-NEXT: nop +; CHECK-NEXT: vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] +; CHECK-NEXT: vst.flush.512.conv [p2, sf, r26] +; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] +; CHECK-NEXT: ret lr +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %num = getelementptr inbounds i8, ptr %params, i20 4 + %0 = load i32, ptr %num, align 4, !tbaa !4 + %inc_I = getelementptr inbounds i8, ptr %params, i20 8 + %1 = load i32, ptr %inc_I, align 8, !tbaa !9 + %inc_O = getelementptr inbounds i8, ptr %params, i20 12 + %2 = load i32, ptr %inc_O, align 4, !tbaa !10 + %3 = load i32, ptr %params, align 64, !tbaa !11 + %4 = icmp ugt i32 %3, 3 + tail call void @llvm.assume(i1 %4) + %5 = trunc i32 %1 to i20 + %6 = trunc i32 %0 to i20 + %7 = trunc i32 %2 to i20 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %i.048 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %pI16.047 = phi ptr [ %in, %entry ], [ %23, %for.body ] + %pO.046 = phi ptr [ %out, %entry ], [ %45, %for.body ] + %fI.sroa.8.045 = phi i32 [ 0, %entry ], [ %25, %for.body ] + %dimsO.sroa.8.044 = phi i32 [ 0, %entry ], [ %49, %for.body ] + %fI.sroa.0.043 = phi <32 x i32> [ undef, %entry ], [ %24, %for.body ] + %fO.sroa.8.042 = phi i32 [ 0, %entry ], [ %47, %for.body ] + %fO.sroa.0.041 = phi <32 x i32> [ undef, %entry ], [ %46, %for.body ] + %dimsI.sroa.8.040 = phi i32 [ 0, %entry ], [ %27, %for.body ] + %8 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p0.p0(ptr %pI16.047, <32 x i32> %fI.sroa.0.043, i32 
%fI.sroa.8.045) + %9 = extractvalue { ptr, <32 x i32>, i32 } %8, 0 + %10 = extractvalue { ptr, <32 x i32>, i32 } %8, 1 + %11 = extractvalue { ptr, <32 x i32>, i32 } %8, 2 + %12 = tail call { <64 x i8>, ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.pop.unaligned.p0.p0(ptr %9, <32 x i32> %10, i32 %11) + %13 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %12, 0 + %14 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %12, 1 + %15 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %12, 2 + %16 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %12, 3 + %17 = bitcast <64 x i8> %13 to <32 x bfloat> + %18 = tail call noundef <32 x float> @llvm.aie2p.v32bf16.to.v32accfloat(<32 x bfloat> %17) + %19 = bitcast <32 x float> %18 to <16 x i64> + %20 = trunc nuw i32 %dimsI.sroa.8.040 to i20 + %21 = tail call { <64 x i8>, ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.ld.pop.2d.unaligned.p0.p0(ptr %14, <32 x i32> %15, i32 %16, i20 %5, i20 %6, i20 %20, i20 0) + %22 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %21, 0 + %23 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %21, 1 + %24 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %21, 2 + %25 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %21, 3 + %26 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %21, 4 + %27 = zext i20 %26 to i32 + %28 = bitcast <64 x i8> %22 to <32 x bfloat> + %29 = tail call noundef <32 x float> @llvm.aie2p.v32bf16.to.v32accfloat(<32 x bfloat> %28) + %30 = bitcast <32 x float> %29 to <16 x i64> + %shuffle2.i.i = shufflevector <16 x i64> %19, <16 x i64> %30, <32 x i32> + %31 = bitcast <32 x i64> %shuffle2.i.i to <64 x float> + %32 = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.v64accfloat.to.v64bfp16ebs8(<64 x float> %31) + %33 = extractvalue { <64 x i8>, <8 x i8> } %32, 0 + %34 = extractvalue { <64 x i8>, <8 x i8> } %32, 1 + %35 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16.p0.p0(ptr %pO.046, <64 x i8> %33, <8 x i8> %34, <32 x i32> 
%fO.sroa.0.041, i32 %fO.sroa.8.042) + %36 = extractvalue { ptr, <32 x i32>, i32 } %35, 0 + %37 = extractvalue { ptr, <32 x i32>, i32 } %35, 1 + %38 = extractvalue { ptr, <32 x i32>, i32 } %35, 2 + %39 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.p0.p0(ptr %36, <32 x i32> %37, i32 %38) + %40 = extractvalue { ptr, <32 x i32>, i32 } %39, 0 + %41 = extractvalue { ptr, <32 x i32>, i32 } %39, 1 + %42 = extractvalue { ptr, <32 x i32>, i32 } %39, 2 + %43 = trunc nuw i32 %dimsO.sroa.8.044 to i20 + %44 = tail call { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv.p0.p0(ptr %40, <32 x i32> %41, i32 %42, i20 %7, i20 %6, i20 %43, i20 0) + %45 = extractvalue { ptr, <32 x i32>, i32, i20 } %44, 0 + %46 = extractvalue { ptr, <32 x i32>, i32, i20 } %44, 1 + %47 = extractvalue { ptr, <32 x i32>, i32, i20 } %44, 2 + %48 = extractvalue { ptr, <32 x i32>, i32, i20 } %44, 3 + %49 = zext i20 %48 to i32 + %inc = add nuw i32 %i.048, 1 + %exitcond.not = icmp eq i32 %inc, %3 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !12 +} + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p0.p0(ptr, <32 x i32>, i32) #1 + +; Function Attrs: nounwind memory(none) +declare <32 x float> @llvm.aie2p.v32bf16.to.v32accfloat(<32 x bfloat>) #2 + +; Function Attrs: nounwind memory(argmem: read) +declare { <64 x i8>, ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.pop.unaligned.p0.p0(ptr, <32 x i32>, i32) #1 + +; Function Attrs: nounwind memory(argmem: read) +declare { <64 x i8>, ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.ld.pop.2d.unaligned.p0.p0(ptr, <32 x i32>, i32, i20, i20, i20, i20) #1 + +; Function Attrs: nounwind memory(argmem: write) +declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16.p0.p0(ptr, <64 x i8>, <8 x i8>, <32 x i32>, i32) #3 + +; Function Attrs: nounwind memory(inaccessiblemem: read) +declare { <64 x i8>, <8 x i8> } @llvm.aie2p.v64accfloat.to.v64bfp16ebs8(<64 x 
float>) #4 + +; Function Attrs: nounwind memory(argmem: write) +declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.p0.p0(ptr, <32 x i32>, i32) #3 + +; Function Attrs: nounwind memory(argmem: write) +declare { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv.p0.p0(ptr, <32 x i32>, i32, i20, i20, i20, i20) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #5 + +attributes #0 = { mustprogress noinline "no-jump-tables"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { nounwind memory(argmem: read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(argmem: write) } +attributes #4 = { nounwind memory(inaccessiblemem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } + +!llvm.linker.options = !{} +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 7, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!4 = !{!5, !6, i64 4} +!5 = !{!"_ZTS13BfToBfpParams", !6, i64 0, !6, i64 4, !6, i64 8, !6, i64 12} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = !{!5, !6, i64 8} +!10 = !{!5, !6, i64 12} +!11 = !{!5, !6, i64 0} +!12 = distinct !{!12, !13, !14, !15} +!13 = !{!"llvm.loop.mustprogress"} +!14 = !{!"llvm.loop.itercount.range", i64 4} +!15 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll new file mode 100644 index 000000000000..bb59ed7c98b4 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +; RUN: llc -mtriple=aie2p --aie-force-postpipeliner %s -o - | FileCheck %s + +; This is a reduced version of the Conv2D_bfp16 kernel function which only contains +; the innermost loop. It was mostly obtained with llvm-extract, but stores were +; manually removed from the loop. + +; Ultimately, we should target II=4 + +; Function Attrs: mustprogress +define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i32 %fW.sroa.14.1488.i, <32 x i32> %fA.sroa.0.1487.i, i32 %fA.sroa.18.1486.i, ptr addrspace(6) %pW.1485.i, ptr addrspace(5) %pA.1484.i, <64 x i32> %0, <64 x i32> %1, <64 x i32> %2, <64 x i32> %3, i32 %4, i32 %5, i20 %6, i20 %7, i20 %8, i20 %9, i20 %10, i20 %11, i32 %12, i32 %13, i32 %14, ptr %.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %pA.1.i.out, ptr %pW.1.i.out, ptr %fA.sroa.18.1.i.out, ptr %fA.sroa.0.1.i.out, ptr %fW.sroa.14.1.i.out, ptr %fW.sroa.0.1.i.out) #3 { +; CHECK-LABEL: conv2d_bfp16.for.body90.i: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %newFuncRoot +; CHECK-NEXT: paddxm [sp], #64 +; CHECK-NEXT: st p6, [sp, #-60]; nopx // 4-byte Folded Spill +; CHECK-NEXT: mov p6, sp +; CHECK-NEXT: padda [p6], #-320 +; CHECK-NEXT: vlda bmll3, [p6, #0] +; CHECK-NEXT: vlda bmlh3, [p6, #64] +; CHECK-NEXT: vlda bmhl3, [p6, #128]; mov m0, #-576 +; CHECK-NEXT: vlda bmhh3, [p6, #192]; mov p6, sp +; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: vlda bmll2, [p6, #0] +; CHECK-NEXT: vlda bmlh2, [p6, #64] +; CHECK-NEXT: vlda bmhl2, [p6, #128]; movxm m0, #-1092 +; CHECK-NEXT: vlda bmhh2, [p6, #192]; mov p6, sp +; CHECK-NEXT: mova m0, #-832; paddb [p6], m0 +; CHECK-NEXT: lda dj4, [p6, #0]; mov p6, sp +; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: vlda bmll1, [p6, #0] +; CHECK-NEXT: vlda bmlh1, [p6, #64] +; CHECK-NEXT: vlda bmhl1, [p6, #128]; 
movxm m0, #-1096 +; CHECK-NEXT: vlda bmhh1, [p6, #192]; mov p6, sp +; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: lda m1, [p6, #0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill +; CHECK-NEXT: mov p7, sp +; CHECK-NEXT: movxm m0, #-1108 +; CHECK-NEXT: nop +; CHECK-NEXT: mov p6, sp +; CHECK-NEXT: padda [p7], m0; movxm m0, #-1112 +; CHECK-NEXT: lda p7, [p7, #0]; paddb [p6], m0; movxm m0, #-1116 +; CHECK-NEXT: lda r7, [p6, #0]; mov p6, sp +; CHECK-NEXT: padda [p6], m0; movxm m0, #-1120 +; CHECK-NEXT: lda r3, [p6, #0]; mov p6, sp +; CHECK-NEXT: padda [p6], m0; movxm m0, #-1088 +; CHECK-NEXT: lda r2, [p6, #0]; mov p6, sp +; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: vlda bmll0, [p6, #0] +; CHECK-NEXT: vlda bmlh0, [p6, #64] +; CHECK-NEXT: vlda bmhl0, [p6, #128] +; CHECK-NEXT: vlda bmhh0, [p6, #192]; movx r25, #0; mov dn0, p3 +; CHECK-NEXT: mova dc4, #0; vldb.fill.512 [p1, lf1, r25]; mov dj0, p4 +; CHECK-NEXT: mova r24, #0; vldb.fill.512 [p1, lf1, r25]; mov dn4, p5 +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc0, dc4 +; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25]; add r1, r6, #-1; mov m0, p2 +; CHECK-NEXT: vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movxm ls, #.LBB0_1 +; CHECK-NEXT: vldb.pop.576 ex4, [p0, lf0, r24]; movxm le, #.L_LEnd0 +; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24, m1]; add.nc lc, r1, #-2 +; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.576 ex4, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; nopv +; CHECK-NEXT: mova r0, #780; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; nopv +; CHECK-NEXT: .p2align 4 +; 
CHECK-NEXT: .LBB0_1: // %for.body90.i +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nopxm ; vmac.f dm0, dm0, ex8, ex4, r0 +; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex4, r0 +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; vmac.f dm2, dm2, ex8, ex6, r0 +; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm3, dm3, ex10, ex6, r0 +; CHECK-NEXT: vldb.pop.576.3d ex2, [p1, lf1, r25, d0] +; CHECK-NEXT: vldb.pop.576 ex4, [p0, lf0, r24]; vshuffle ex8, ex0, ex2, r4 +; CHECK-NEXT: .L_LEnd0: +; CHECK-NEXT: nopa ; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; nopv +; CHECK-NEXT: // %bb.2: // %for.cond.cleanup89.i.exitStub +; CHECK-NEXT: lda p6, [sp, #-60]; nopb ; nopx ; mov p0, r7; vmac.f dm0, dm0, ex8, ex4, r0 // 4-byte Folded Reload +; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex4, r0 +; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 +; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: vshuffle ex8, ex0, ex2, r4 +; CHECK-NEXT: vshuffle ex10, ex0, ex2, r5 +; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex4, r0 +; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex4, r0 +; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 +; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vst bmll0, [p7, #0] +; CHECK-NEXT: vst bmlh0, [p7, #64] +; CHECK-NEXT: vst bmhl0, [p7, #128] +; CHECK-NEXT: vst bmhh0, [p7, #192] +; CHECK-NEXT: lda p7, [sp, #-64] // 4-byte Folded Reload +; CHECK-NEXT: vst bmll1, [p0, #0] +; CHECK-NEXT: vst bmlh1, [p0, #64] +; CHECK-NEXT: vst bmhl1, [p0, #128] +; CHECK-NEXT: vst bmhh1, [p0, #192]; mov p0, r3 +; CHECK-NEXT: vst bmll2, [p0, #0] +; CHECK-NEXT: vst bmlh2, [p0, #64] +; CHECK-NEXT: vst bmhl2, [p0, #128] +; CHECK-NEXT: vst bmhh2, [p0, #192]; mov p0, r2 +; CHECK-NEXT: ret lr +; CHECK-NEXT: vst bmll3, [p0, #0] // Delay Slot 5 +; CHECK-NEXT: vst bmlh3, [p0, #64] // Delay Slot 4 +; CHECK-NEXT: vst bmhl3, [p0, #128] // Delay 
Slot 3 +; CHECK-NEXT: vst bmhh3, [p0, #192]; paddxm [sp], #-64 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +newFuncRoot: + br label %for.body90.i + +for.body90.i: ; preds = %newFuncRoot, %for.body90.i + %fW.sroa.0.1502.i = phi <32 x i32> [ undef, %newFuncRoot ], [ %fW.sroa.0.1.i, %for.body90.i ] + %fW.sroa.14.1501.i = phi i32 [ 0, %newFuncRoot ], [ %fW.sroa.14.1.i, %for.body90.i ] + %fA.sroa.0.1500.i = phi <32 x i32> [ undef, %newFuncRoot ], [ %fA.sroa.0.1.i, %for.body90.i ] + %fA.sroa.18.1499.i = phi i32 [ 0, %newFuncRoot ], [ %fA.sroa.18.1.i, %for.body90.i ] + %pW.1498.i = phi ptr addrspace(6) [ %pW.1485.i, %newFuncRoot ], [ %pW.1.i, %for.body90.i ] + %pA.1497.i = phi ptr addrspace(5) [ %pA.1484.i, %newFuncRoot ], [ %pA.1.i, %for.body90.i ] + %i.0496.i = phi i32 [ 1, %newFuncRoot ], [ %inc.i, %for.body90.i ] + %accs.sroa.26.1495.i = phi <64 x i32> [ %0, %newFuncRoot ], [ %60, %for.body90.i ] + %accs.sroa.18.1494.i = phi <64 x i32> [ %1, %newFuncRoot ], [ %59, %for.body90.i ] + %accs.sroa.10.1493.i = phi <64 x i32> [ %2, %newFuncRoot ], [ %58, %for.body90.i ] + %accs.sroa.0.1492.i = phi <64 x i32> [ %3, %newFuncRoot ], [ %57, %for.body90.i ] + %dimsAI.sroa.17.1491.i = phi i32 [ 0, %newFuncRoot ], [ %41, %for.body90.i ] + %dimsAI.sroa.13.1490.i = phi i32 [ 0, %newFuncRoot ], [ %39, %for.body90.i ] + %15 = tail call { ptr addrspace(5), <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p0.p0(ptr addrspace(5) %pA.1497.i, <32 x i32> %fA.sroa.0.1500.i, i32 %fA.sroa.18.1499.i), !alias.scope !4, !noalias !7 + %16 = extractvalue { ptr addrspace(5), <32 x i32>, i32 } %15, 0 + %17 = extractvalue { ptr addrspace(5), <32 x i32>, i32 } %15, 1 + %18 = extractvalue { ptr addrspace(5), <32 x i32>, i32 } %15, 2 + %19 = tail call { ptr addrspace(5), <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p0.p0(ptr addrspace(5) %16, <32 x i32> %17, i32 %18), !alias.scope !4, !noalias !7 + %20 = extractvalue { ptr addrspace(5), <32 x i32>, i32 } %19, 0 + %21 = extractvalue { ptr addrspace(5), <32 
x i32>, i32 } %19, 1 + %22 = extractvalue { ptr addrspace(5), <32 x i32>, i32 } %19, 2 + %23 = tail call { ptr addrspace(6), <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p0.p0(ptr addrspace(6) %pW.1498.i, <32 x i32> %fW.sroa.0.1502.i, i32 %fW.sroa.14.1501.i), !alias.scope !11, !noalias !12 + %24 = extractvalue { ptr addrspace(6), <32 x i32>, i32 } %23, 0 + %25 = extractvalue { ptr addrspace(6), <32 x i32>, i32 } %23, 1 + %26 = extractvalue { ptr addrspace(6), <32 x i32>, i32 } %23, 2 + %27 = tail call { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.bfp16.p0.p0(ptr addrspace(5) %20, <32 x i32> %21, i32 %22), !alias.scope !4, !noalias !7 + %28 = extractvalue { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } %27, 0 + %29 = extractvalue { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } %27, 1 + %30 = extractvalue { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } %27, 2 + %31 = extractvalue { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } %27, 3 + %32 = extractvalue { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } %27, 4 + %33 = trunc nuw i32 %dimsAI.sroa.13.1490.i to i20 + %34 = trunc nuw i32 %dimsAI.sroa.17.1491.i to i20 + %35 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p0.p0(ptr addrspace(5) %28, <32 x i32> %29, i32 %30, i20 %6, i20 %7, i20 %33, i20 %8, i20 %9, i20 %34, i20 %10), !alias.scope !4, !noalias !7 + %36 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 5 + %37 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 6 + %38 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 3 + %39 = zext i20 %38 to i32 + %40 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 4 + %41 = zext i20 %40 to i32 + %42 = tail call { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 
x i8> } @llvm.aie2p.fifo.ld.pop.576.bfp16.p0.p0(ptr addrspace(6) %24, <32 x i32> %25, i32 %26), !alias.scope !11, !noalias !12 + %43 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %42, 0 + %44 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %42, 1 + %45 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %42, 2 + %46 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %42, 3 + %47 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %42, 4 + %48 = tail call { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.1d.bfp16.p0.p0(ptr addrspace(6) %43, <32 x i32> %44, i32 %45, i20 %11), !alias.scope !11, !noalias !12 + %49 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %48, 3 + %50 = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %48, 4 + %51 = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8> %31, <8 x i8> %32, <64 x i8> %36, <8 x i8> %37, i32 %12) + %52 = extractvalue { <64 x i8>, <8 x i8> } %51, 0 + %53 = extractvalue { <64 x i8>, <8 x i8> } %51, 1 + %54 = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8> %31, <8 x i8> %32, <64 x i8> %36, <8 x i8> %37, i32 %13) + %55 = extractvalue { <64 x i8>, <8 x i8> } %54, 0 + %56 = extractvalue { <64 x i8>, <8 x i8> } %54, 1 + %57 = tail call <64 x i32> @llvm.aie2p.BFP576.BFP576.ACC2048.mac.conf(<64 x i8> %52, <8 x i8> %53, <64 x i8> %46, <8 x i8> %47, <64 x i32> %accs.sroa.0.1492.i, i32 780) + %58 = tail call <64 x i32> @llvm.aie2p.BFP576.BFP576.ACC2048.mac.conf(<64 x i8> %55, <8 x i8> %56, <64 x i8> %46, <8 x i8> %47, <64 x i32> %accs.sroa.10.1493.i, i32 780) + %59 = tail call <64 x i32> @llvm.aie2p.BFP576.BFP576.ACC2048.mac.conf(<64 x i8> %52, <8 x i8> %53, <64 x i8> %49, <8 x i8> %50, <64 x i32> %accs.sroa.18.1494.i, i32 780) + %60 = tail call <64 x i32> 
@llvm.aie2p.BFP576.BFP576.ACC2048.mac.conf(<64 x i8> %55, <8 x i8> %56, <64 x i8> %49, <8 x i8> %50, <64 x i32> %accs.sroa.26.1495.i, i32 780) + %inc.i = add nuw i32 %i.0496.i, 1 + %pA.1.i = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 0 + %pW.1.i = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %48, 0 + %fA.sroa.18.1.i = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 2 + %fA.sroa.0.1.i = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %35, 1 + %fW.sroa.14.1.i = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %48, 2 + %fW.sroa.0.1.i = extractvalue { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } %48, 1 + %exitcond.not.i = icmp eq i32 %inc.i, %14 + br i1 %exitcond.not.i, label %for.cond.cleanup89.i.exitStub, label %for.body90.i, !llvm.loop !13 + +for.cond.cleanup89.i.exitStub: ; preds = %for.body90.i + store <64 x i32> %57, ptr %.out2, align 256 + store <64 x i32> %58, ptr %.out3, align 256 + store <64 x i32> %59, ptr %.out4, align 256 + store <64 x i32> %60, ptr %.out5, align 256 + ret void +} + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(5), <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p5.p5(ptr addrspace(5), <32 x i32>, i32) #0 + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(6), <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill.p6.p6(ptr addrspace(6), <32 x i32>, i32) #0 + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(5), <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.bfp16.p5.p5(ptr addrspace(5), <32 x i32>, i32) #0 + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5), <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #0 + +; Function Attrs: nounwind 
memory(argmem: read) +declare { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.bfp16.p6.p6(ptr addrspace(6), <32 x i32>, i32) #0 + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(6), <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.1d.bfp16.p6.p6(ptr addrspace(6), <32 x i32>, i32, i20) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8>, <8 x i8>, <64 x i8>, <8 x i8>, i32) #1 + +; Function Attrs: nounwind memory(inaccessiblemem: read) +declare <64 x i32> @llvm.aie2p.BFP576.BFP576.ACC2048.mac.conf(<64 x i8>, <8 x i8>, <64 x i8>, <8 x i8>, <64 x i32>, i32) #2 + +attributes #0 = { nounwind memory(argmem: read) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nounwind memory(inaccessiblemem: read) } +attributes #3 = { mustprogress "no-jump-tables"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + +!llvm.linker.options = !{} +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 7, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!4 = !{!5} +!5 = distinct !{!5, !6, !"_Z14conv2d_genericILh1EL5act_t0ELb0ELb0EEvPu6__bf16S1_S1_S1_R27conv2d_bf16_internal_params10out_mode_t: %input"} +!6 = distinct !{!6, !"_Z14conv2d_genericILh1EL5act_t0ELb0ELb0EEvPu6__bf16S1_S1_S1_R27conv2d_bf16_internal_params10out_mode_t"} +!7 = !{!8, !9, !10} +!8 = distinct !{!8, !6, !"_Z14conv2d_genericILh1EL5act_t0ELb0ELb0EEvPu6__bf16S1_S1_S1_R27conv2d_bf16_internal_params10out_mode_t: %weights"} +!9 = distinct !{!9, !6, !"_Z14conv2d_genericILh1EL5act_t0ELb0ELb0EEvPu6__bf16S1_S1_S1_R27conv2d_bf16_internal_params10out_mode_t: %acc_in"} +!10 = distinct !{!10, !6, !"_Z14conv2d_genericILh1EL5act_t0ELb0ELb0EEvPu6__bf16S1_S1_S1_R27conv2d_bf16_internal_params10out_mode_t: %output"} +!11 = !{!8} +!12 = 
!{!5, !9, !10} +!13 = distinct !{!13, !14, !15, !16} +!14 = !{!"llvm.loop.mustprogress"} +!15 = !{!"llvm.loop.itercount.range", i64 7} +!16 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir new file mode 100644 index 000000000000..5d9b58313f94 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir @@ -0,0 +1,217 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p --aie-loop-min-tripcount=4 --aie-addrspace-none-is-safe=0 %s \ +# RUN: --start-before=postmisched --debug-only=postpipeliner-summary -o - | FileCheck %s --check-prefix=CONSERVATIVE +# RUN: llc --mtriple=aie2p --aie-loop-min-tripcount=4 --aie-addrspace-none-is-safe=1 %s \ +# RUN: --start-before=postmisched --debug-only=postpipeliner-summary -o - | FileCheck %s --check-prefix=OPTIMISTIC + + +--- | + define void @convert_bf16_to_bfp16(ptr noalias %in, ptr noalias %out, ptr nonnull align 64 dereferenceable(64) %params) { + ; CONSERVATIVE-LABEL: convert_bf16_to_bfp16: + ; CONSERVATIVE: .p2align 4 + ; CONSERVATIVE-NEXT: // %bb.0: // %entry + ; CONSERVATIVE-NEXT: lda r0, [p2, #0] + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: jz r0, #.LBB0_4 + ; CONSERVATIVE-NEXT: nop // Delay Slot 5 + ; CONSERVATIVE-NEXT: nop // Delay Slot 4 + ; CONSERVATIVE-NEXT: nop // Delay Slot 3 + ; CONSERVATIVE-NEXT: nop // Delay Slot 2 + ; CONSERVATIVE-NEXT: nop // Delay Slot 1 + 
; CONSERVATIVE-NEXT: // %bb.1: + ; CONSERVATIVE-NEXT: mova m0, #12; nopb ; nops ; nopx ; mov p3, p2; nopv + ; CONSERVATIVE-NEXT: padda [p3], m0 + ; CONSERVATIVE-NEXT: lda m0, [p3], #-4 + ; CONSERVATIVE-NEXT: lda m1, [p3], #-4 + ; CONSERVATIVE-NEXT: lda dn0, [p3, #0] + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: mova dj0, #0 + ; CONSERVATIVE-NEXT: mova r24, #0; mov dj1, dj0 + ; CONSERVATIVE-NEXT: mov r26, r24 + ; CONSERVATIVE-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc1, dj0 + ; CONSERVATIVE-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 + ; CONSERVATIVE-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; add.nc lc, r0, #-2 + ; CONSERVATIVE-NEXT: movxm ls, #.LBB0_2 + ; CONSERVATIVE-NEXT: movxm le, #.L_LEnd0 + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CONSERVATIVE-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv + ; CONSERVATIVE-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopxm ; nopv + ; CONSERVATIVE-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; mov p2, p1; nopv + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; mov dc0, dj0; nopv + ; CONSERVATIVE-NEXT: // implicit-def: $sf + ; CONSERVATIVE-NEXT: .p2align 4 + ; CONSERVATIVE-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 + ; CONSERVATIVE-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nopx ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] + ; CONSERVATIVE-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26] + ; CONSERVATIVE-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0] + ; CONSERVATIVE-NEXT: vconv.fp32.bf16 cml0, x0 + ; CONSERVATIVE-NEXT: vconv.fp32.bf16 cmh0, x2 + ; CONSERVATIVE-NEXT: .L_LEnd0: + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CONSERVATIVE-NEXT: // %bb.3: + ; CONSERVATIVE-NEXT: 
nopa ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26]; nopx + ; CONSERVATIVE-NEXT: vst.flush.512.conv [p2, sf, r26] + ; CONSERVATIVE-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] + ; CONSERVATIVE-NEXT: vconv.fp32.bf16 cml0, x0 + ; CONSERVATIVE-NEXT: vconv.fp32.bf16 cmh0, x2 + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] + ; CONSERVATIVE-NEXT: vst.flush.512.conv [p2, sf, r26] + ; CONSERVATIVE-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: .p2align 4 + ; CONSERVATIVE-NEXT: .LBB0_4: + ; CONSERVATIVE-NEXT: nopa ; ret lr + ; CONSERVATIVE-NEXT: nop // Delay Slot 5 + ; CONSERVATIVE-NEXT: nop // Delay Slot 4 + ; CONSERVATIVE-NEXT: nop // Delay Slot 3 + ; CONSERVATIVE-NEXT: nop // Delay Slot 2 + ; CONSERVATIVE-NEXT: nop // Delay Slot 1 + ; + ; OPTIMISTIC-LABEL: convert_bf16_to_bfp16: + ; OPTIMISTIC: .p2align 4 + ; OPTIMISTIC-NEXT: // %bb.0: // %entry + ; OPTIMISTIC-NEXT: lda r0, [p2, #0] + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: jz r0, #.LBB0_4 + ; OPTIMISTIC-NEXT: nop // Delay Slot 5 + ; OPTIMISTIC-NEXT: nop // Delay Slot 4 + ; OPTIMISTIC-NEXT: nop // Delay Slot 3 + ; OPTIMISTIC-NEXT: nop // Delay Slot 2 + ; OPTIMISTIC-NEXT: nop // Delay Slot 1 + ; OPTIMISTIC-NEXT: // %bb.1: + ; OPTIMISTIC-NEXT: mova m0, #12; nopb ; nopx ; mov p3, p2 + ; OPTIMISTIC-NEXT: padda [p3], m0 + ; OPTIMISTIC-NEXT: lda m0, [p3], #-4 + ; OPTIMISTIC-NEXT: lda m1, [p3], #-4 + ; OPTIMISTIC-NEXT: lda dn0, [p3, #0] + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: mova dj0, #0 + ; OPTIMISTIC-NEXT: mova r24, #0; mov dj1, dj0 + ; OPTIMISTIC-NEXT: mov r26, r24 + ; OPTIMISTIC-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc1, dj0 + ; OPTIMISTIC-NEXT: vldb.pop.512 x0, [p0, lf0, 
r24]; mov dn1, dn0 + ; OPTIMISTIC-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; add.nc lc, r0, #-3 + ; OPTIMISTIC-NEXT: movxm ls, #.LBB0_2 + ; OPTIMISTIC-NEXT: vldb.fill.512 [p0, lf0, r24]; movxm le, #.L_LEnd0 + ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopxm ; nopv + ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopxm ; nopv + ; OPTIMISTIC-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; OPTIMISTIC-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; mov p2, p1; nopv + ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv + ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv + ; OPTIMISTIC-NEXT: nopa ; nopb ; nops ; nopx ; mov dc0, dj0; nopv + ; OPTIMISTIC-NEXT: // implicit-def: $sf + ; OPTIMISTIC-NEXT: .p2align 4 + ; OPTIMISTIC-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 + ; OPTIMISTIC-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nopx ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] + ; OPTIMISTIC-NEXT: vst.flush.512.conv [p2, sf, r26]; vldb.pop.512 x0, [p0, lf0, r24]; vconv.fp32.bf16 cml0, x0 + ; OPTIMISTIC-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vconv.fp32.bf16 cmh0, x2 + ; OPTIMISTIC-NEXT: .L_LEnd0: + ; OPTIMISTIC-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; OPTIMISTIC-NEXT: // %bb.3: + ; OPTIMISTIC-NEXT: vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] + ; OPTIMISTIC-NEXT: vst.flush.512.conv [p2, sf, r26]; vconv.fp32.bf16 cml0, x0 + ; OPTIMISTIC-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]; vconv.fp32.bf16 cmh0, x2 + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] + ; OPTIMISTIC-NEXT: vst.flush.512.conv [p2, sf, r26]; vconv.fp32.bf16 cml0, x0 + ; OPTIMISTIC-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]; vconv.fp32.bf16 cmh0, x2 + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: 
vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] + ; OPTIMISTIC-NEXT: vst.flush.512.conv [p2, sf, r26] + ; OPTIMISTIC-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: .p2align 4 + ; OPTIMISTIC-NEXT: .LBB0_4: + ; OPTIMISTIC-NEXT: nopa ; ret lr + ; OPTIMISTIC-NEXT: nop // Delay Slot 5 + ; OPTIMISTIC-NEXT: nop // Delay Slot 4 + ; OPTIMISTIC-NEXT: nop // Delay Slot 3 + ; OPTIMISTIC-NEXT: nop // Delay Slot 2 + ; OPTIMISTIC-NEXT: nop // Delay Slot 1 + entry: + ret void + } +... +--- +name: convert_bf16_to_bfp16 +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.3(0x30000000), %bb.1(0x50000000) + liveins: $p0, $p1, $p2 + + renamable $r0 = LDA_dms_lda_idx_imm $p2, 0 :: (dereferenceable load (s32) from %ir.params, align 64) + JZ renamable $r0, %bb.3 + DelayedSchedBarrier + + bb.1: + successors: %bb.2(0x80000000) + liveins: $p1, $p2, $plfr0:0x0000000000400000, $r0 + + $p3 = MOV_alu_mv_mv_mv_scl $p2 + $p2 = MOV_alu_mv_mv_mv_scl $p1 + renamable $m0 = MOV_PD_imm11_pseudo 12 + renamable $dj0 = MOV_PD_imm11_pseudo 0 + renamable $p3 = nuw PADD_mod_pseudo killed renamable $p3, killed renamable $m0 + $dj1 = MOV_alu_mv_mv_mv_scl $dj0 + renamable $m0, renamable $p3 = LDA_dms_lda_pstm_nrm_imm killed renamable $p3, -4 :: (dereferenceable load (s20) from %ir.params, align 4) + renamable $m1, renamable $p3 = LDA_dms_lda_pstm_nrm_imm killed renamable $p3, -4 :: (dereferenceable load (s20) from %ir.params, align 4) + renamable $dn0 = LDA_dms_lda_idx_imm killed renamable $p3, 0 :: (dereferenceable load (s20) from %ir.params, align 4) + renamable $r24 = MOV_RLC_imm11_pseudo 0 + $dn1 = MOV_alu_mv_mv_mv_scl killed $dn0 + $dc0 = MOV_alu_mv_mv_mv_scl $dj0 + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + $dc1 = MOV_alu_mv_mv_mv_scl $dj0 + renamable $sf = IMPLICIT_DEF + $r26 = MOV_alu_mv_mv_mv_scl $r24 + + bb.2 (align 
16): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $sf, $d0:0x0000000000200E00, $d1:0x0000000000200E00, $p2, $plfr0:0x0000000000484040, $r26 + + $p0, $lf0, $r24 = VLDB_FILL_512 $p0, $lf0, $r24 :: (load unknown-size from %ir.in, align 1) + $x0, $p0, $lf0, $r24 = VLDB_POP_512_normal_pop $p0, $lf0, $r24, implicit-def $srfifo_uf :: (load unknown-size from %ir.in, align 1) + $x2, $p0, $lf0, $r24, $dc1 = VLDB_POP_512_2D $p0, $lf0, $r24, $d1, implicit-def $srfifo_uf :: (load unknown-size from %ir.in, align 1) + renamable $cml0 = VCONV_fp32_bf16_mv_ups_xbf killed renamable $x0 + renamable $cmh0 = VCONV_fp32_bf16_mv_ups_xbf killed renamable $x2 + renamable $sf, renamable $p2, renamable $r26 = VST_PUSH_576_CONV_bfp16ebs8_fp32 killed renamable $sf, killed renamable $dm0, killed renamable $p2, killed renamable $r26, implicit-def $srf2bflags, implicit-def $srfifo_of, implicit $crf2bmask, implicit $crrnd :: (store unknown-size into %ir.out, align 4) + renamable $sf, renamable $p2, renamable $r26 = VST_FLUSH_512_CONV_normal_flush killed renamable $sf, killed renamable $p2, killed renamable $r26, implicit-def $srfifo_of :: (store unknown-size into %ir.out, align 4) + $sf, $p2, $r26, $dc0 = VST_FLUSH_512_CONV_2D killed $sf, killed $p2, killed $r26, $d0, implicit-def $srfifo_of :: (store unknown-size into %ir.out, align 4) + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir new file mode 100644 index 000000000000..3e3334a98a0a --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir @@ -0,0 +1,104 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p --aie-loop-min-tripcount=7 %s \ +# RUN: --start-before=postmisched --debug-only=postpipeliner-summary -o - | FileCheck %s + + +--- | + define void @conv2d_bfp16(ptr addrspace(5) noalias %pA, ptr addrspace(6) noalias %pW) { + ; CHECK-LABEL: conv2d_bfp16: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r25, #0; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; movx r24, #0 + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24] + ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24] + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1] + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; add.nc lc, r1, #-4 + ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; movxm ls, #.LBB0_1 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; nopv + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; nopb ; nops ; movx r0, #780; vshuffle ex10, ex0, ex4, r5; nopv + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: 
vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: // %bb.2: + ; CHECK-NEXT: nopa ; nopb ; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24, m1]; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + ret void + } +... 
+--- +name: conv2d_bfp16 +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x80000000) + liveins: $p0, $p1, $p2, $p3, $p4, $p5, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $y2, $y3, $p6, $p7 + renamable $r0 = MOV_RLC_imm11_pseudo 780 + $lc = ADD_NC_mv_add_ri $r1, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + renamable $r24 = MOV_RLC_imm11_pseudo 0 + renamable $r25 = MOV_RLC_imm11_pseudo 0 + + bb.1 (align 16): + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $m1, $plfr0:0x0000000000484040, $plfr1:0x0000000000484040, $r0, $r4, $r5, $d0_3d:0x0001C00000200E00 + + $p1, $lf1, $r25 = VLDA_FILL_512 $p1, $lf1, $r25 :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $p1, $lf1, $r25 = VLDA_FILL_512 $p1, $lf1, $r25 :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $p0, $lf0, $r24 = VLDB_FILL_512 $p0, $lf0, $r24 :: (load unknown-size from %ir.pW, align 1, addrspace 6) + $ex0, $p1, $lf1, $r25 = VLDA_POP_576_normal_pop $p1, $lf1, $r25, implicit-def $srfifo_uf :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $ex4, $p1, $lf1, $r25, $dc0, $dc4 = VLDA_POP_576_3D $p1, $lf1, $r25, $d0_3d, implicit-def $srfifo_uf :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $ex2, $p0, $lf0, $r24 = VLDB_POP_576_normal_pop $p0, $lf0, $r24, implicit-def $srfifo_uf :: (load unknown-size from %ir.pW, align 1, addrspace 6) + $ex6, $p0, $lf0, $r24 = VLDB_POP_576_fifo_1d_pop $p0, $lf0, $r24, $m1, implicit-def $srfifo_uf :: (load unknown-size from %ir.pW, align 1, addrspace 6) + renamable $ex8 = VSHUFFLE_vec_shuffle_ex renamable $ex0, renamable $ex4, renamable $r4 + renamable $ex10 = VSHUFFLE_vec_shuffle_ex killed renamable $ex0, killed renamable $ex4, renamable $r5 + renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, renamable $ex8, renamable $ex2, renamable $r0, implicit-def dead $srfpflags, implicit 
$crfpmask + renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex10, killed renamable $ex2, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex8, renamable $ex6, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, killed renamable $ex10, killed renamable $ex6, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.1 + + bb.2: + liveins: $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C + + RET implicit $lr + DelayedSchedBarrier implicit $dm0, implicit $dm1, implicit $dm2, implicit $dm3 +... diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir new file mode 100644 index 000000000000..ecd01959cc45 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2p --aie-loop-min-tripcount=7 %s \ +# RUN: --start-before=postmisched --debug-only=postpipeliner-summary -o - | FileCheck %s + + +--- | + define void @conv2d_bfp16(ptr addrspace(5) noalias %pA, ptr addrspace(6) noalias %pW) { + ; CHECK-LABEL: conv2d_bfp16: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r25, #0 + ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: mova r24, #0; vldb.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] + ; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25] + ; CHECK-NEXT: vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; add.nc lc, r1, #-2 + ; CHECK-NEXT: vldb.pop.576 ex2, [p0, lf0, r24]; movxm ls, #.LBB0_1 + ; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24, m1]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; nopv + ; CHECK-NEXT: mova r0, #780; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vldb.pop.576.3d ex4, [p1, lf1, r25, d0] + ; CHECK-NEXT: vldb.pop.576 ex2, [p0, lf0, r24]; vshuffle ex8, ex0, ex4, r4 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: 
nopa ; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; nopv + ; CHECK-NEXT: // %bb.2: + ; CHECK-NEXT: nopa ; nopb ; nopx ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle ex8, ex0, ex4, r4 + ; CHECK-NEXT: vshuffle ex10, ex0, ex4, r5 + ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + ret void + } +... +--- +name: conv2d_bfp16 +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x80000000) + liveins: $p0, $p1, $p2, $p3, $p4, $p5, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $y2, $y3, $p6, $p7 + renamable $r0 = MOV_RLC_imm11_pseudo 780 + $lc = ADD_NC_mv_add_ri $r1, 0 + $ls = MOVXM %bb.1 + $le = MOVXM + renamable $r24 = MOV_RLC_imm11_pseudo 0 + renamable $r25 = MOV_RLC_imm11_pseudo 0 + + bb.1 (align 16): + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $m1, $plfr0:0x0000000000484040, $plfr1:0x0000000000484040, $r0, $r4, $r5, $d0_3d:0x0001C00000200E00 + + $p1, $lf1, $r25 = VLD_FILL_512_pseudo $p1, $lf1, $r25 :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $p1, $lf1, $r25 = VLD_FILL_512_pseudo $p1, $lf1, $r25 :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $p0, $lf0, $r24 = VLD_FILL_512_pseudo $p0, $lf0, $r24 :: (load unknown-size from %ir.pW, align 1, addrspace 6) + $ex0, $p1, $lf1, $r25 = VLD_POP_576_normal_pop_pseudo $p1, $lf1, $r25, implicit-def $srfifo_uf :: (load 
unknown-size from %ir.pA, align 1, addrspace 5) + $ex4, $p1, $lf1, $r25, $dc0, $dc4 = VLD_POP_576_3D_pseudo $p1, $lf1, $r25, $d0_3d, implicit-def $srfifo_uf :: (load unknown-size from %ir.pA, align 1, addrspace 5) + $ex2, $p0, $lf0, $r24 = VLD_POP_576_normal_pop_pseudo $p0, $lf0, $r24, implicit-def $srfifo_uf :: (load unknown-size from %ir.pW, align 1, addrspace 6) + $ex6, $p0, $lf0, $r24 = VLD_POP_576_fifo_1d_pop_pseudo $p0, $lf0, $r24, $m1, implicit-def $srfifo_uf :: (load unknown-size from %ir.pW, align 1, addrspace 6) + renamable $ex8 = VSHUFFLE_vec_shuffle_ex renamable $ex0, renamable $ex4, renamable $r4 + renamable $ex10 = VSHUFFLE_vec_shuffle_ex killed renamable $ex0, killed renamable $ex4, renamable $r5 + renamable $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm0, renamable $ex8, renamable $ex2, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm1, renamable $ex10, killed renamable $ex2, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm2, killed renamable $ex8, renamable $ex6, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + renamable $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX killed renamable $dm3, killed renamable $ex10, killed renamable $ex6, renamable $r0, implicit-def dead $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.1 + + bb.2: + liveins: $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C + + RET implicit $lr + DelayedSchedBarrier implicit $dm0, implicit $dm1, implicit $dm2, implicit $dm3 +...