diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp index c48e7ae024fb..0531f04809fb 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp @@ -1021,6 +1021,23 @@ const MCSlotInfo *AIEBaseInstrInfo::getSlotInfo(const MCSlotKind Kind) const { return FormatInterface->getSlotInfo(Kind); } +bool AIEBaseInstrInfo::isMultiSlotPseudo(const MachineInstr &MI) const { + return MI.isPseudo() && + getFormatInterface()->getAlternateInstsOpcode(MI.getOpcode()); +} + +std::optional +AIEBaseInstrInfo::getSlotOpcode(const MCSlotKind Slot, + const MachineInstr &MI) const { + assert(isMultiSlotPseudo(MI)); + for (const auto &OpCode : + *getFormatInterface()->getAlternateInstsOpcode(MI.getOpcode())) { + if (getSlotKind(OpCode) == Slot) + return OpCode; + } + return {}; +} + const PacketFormats &AIEBaseInstrInfo::getPacketFormats() const { return FormatInterface->getPacketFormats(); } diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index b5fa40fa7c35..ec29801d5e8a 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -77,6 +77,13 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { /// slot will be the default one (unknown). MCSlotKind getSlotKind(unsigned Opcode) const; virtual const MCSlotInfo *getSlotInfo(const MCSlotKind Kind) const; + /// \return Opcode of multi-slot pseudo \p MI that runs in \p Slot , or an + /// empty optional if none of its alternate opcodes maps to \p Slot . + /// \pre \p MI is a multi-slot pseudo (asserted in the implementation). + std::optional getSlotOpcode(const MCSlotKind Slot, + const MachineInstr &MI) const; + + /// \return whether \p MI is a multi-slot pseudo instruction + bool isMultiSlotPseudo(const MachineInstr &MI) const; + /// Return the Packet formats for this target virtual const PacketFormats &getPacketFormats() const; /// Return a nop of the given byte size, or the smallest if zero. 
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 271baf6a1da6..826aae1b6186 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -18,6 +18,7 @@ #include "AIELiveRegs.h" #include "AIEMachineScheduler.h" #include "AIEMaxLatencyFinder.h" +#include "AIEMultiSlotInstrMaterializer.h" #include "Utils/AIELoopUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -60,6 +61,11 @@ static cl::opt PostPipelinerMaxII( "aie-postpipeliner-maxii", cl::init(40), cl::desc("[AIE] Maximum II to be tried in the post-ra pipeliner")); +static cl::opt EnableMultiSlotInstrMaterialization( + "aie-multi-slot-pseudo-instr", cl::Hidden, cl::init(false), + cl::desc("Statically materialize Multi-Slot Pseudo Instructions in " + "loops.")); + namespace llvm::AIE { void dumpInterBlock(const InterBlockEdges &Edges) { @@ -586,7 +592,7 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { // But first try SWP if (BS.getRegions().size() == 1) { auto &PostSWP = BS.getPostSWP(); - if (PostSWP.canAccept(*BS.TheBlock)) { + if (PostSWP.isPostPipelineCandidate(*BS.TheBlock)) { BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock); return BS.FixPoint.Stage = SchedulingStage::Pipelining; } @@ -1161,6 +1167,11 @@ void BlockState::initInterBlock(const MachineSchedContext &Context, // Don't worry, this just constructs a mostly empty container class auto NumInstrs = getTop().getFreeInstructions().size(); PostSWP = std::make_unique(HR, NumInstrs); + + // perform static assignment of multi-slot pseudos + if (EnableMultiSlotInstrMaterialization && + PostSWP->isPostPipelineCandidate(*TheBlock)) + staticallyMaterializeMultiSlotInstructions(*TheBlock, HR); } // We are called just after the first round of scheduling a block. 
diff --git a/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp new file mode 100644 index 000000000000..2a149395c61e --- /dev/null +++ b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp @@ -0,0 +1,229 @@ +//===--- AIEMultiSlotInstrMaterializer.cpp - -Multi Slot Instr materializer===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// \file assigns an issue slot to multi-slot pseudo instructions within a single +// block loop to help loop pipelining. +// +//===----------------------------------------------------------------------===// + +#include "AIEMultiSlotInstrMaterializer.h" +#include "AIEHazardRecognizer.h" + +using namespace llvm; + +#define DEBUG_TYPE "aie-multi-slot-pseudo" + +namespace llvm::AIE { + +/// Tracks which MemoryBanks are associated with each issue Slot while the +/// multi-slot pseudo loads of a block get a Slot assigned. +class SlotMapping { +public: + SlotMapping(const AIEBaseInstrInfo *TII) : TII(TII) {} + + /// update \p MemBankBits assigned to \p Slot . Create the Slot mapping, if + /// necessary. + void update(const MCSlotKind &Slot, const MemoryBankBits MemBankBits) { + SlotToBanks[Slot] |= MemBankBits; + } + + /// \return first Slot where MemoryBankBits overlap with \p MemBankBits . + std::optional + getAssignedSlot(const MemoryBankBits MemBankBits) const { + auto IT = + find_if(SlotToBanks, + [MemBankBits]( + const std::pair &SlotBankPair) { + auto [Slot, Banks] = SlotBankPair; + return (Banks & MemBankBits) != 0; + }); + + if (IT == SlotToBanks.end()) + return {}; + + const auto Slot = IT->first; + return Slot; + } + + /// \return whether no MemoryBank has multiple Slots assigned to it in the + /// current mapping. 
+ bool hasUniqueSlotForBank() const { + MemoryBankBits AccumulatedBanks = {}; + for (auto &[Slot, Banks] : SlotToBanks) { + if (Banks & AccumulatedBanks) { + LLVM_DEBUG(dbgs() << "Conflict detected at Slot " << Slot << "\n"); + return false; + } + AccumulatedBanks |= Banks; + } + return true; + } + + /// \return whether a Slot can be assigned to \p MI and assign it in the + /// mapping. Fails only when \p MI has no MemoryBanks attached. + bool assignSlot(const MachineInstr &MI, const AIEHazardRecognizer &HR) { + auto MemBankBits = HR.getMemoryBanks(&MI); + LLVM_DEBUG(dbgs() << "Memory Bank: " << MemBankBits << " " << MI); + if (!MemBankBits) { + LLVM_DEBUG(dbgs() << "Warning: No MemoryBanks assigned to " << MI); + return false; + } + + std::optional SelectedSlot = getAssignedSlot(MemBankBits); + if (!SelectedSlot) + SelectedSlot = getUnusedLoadSlot(); + if (!SelectedSlot) { + LLVM_DEBUG(dbgs() << "Reassigning existing Slot to MemoryBankBits " + << MemBankBits << "\n"); + SelectedSlot = getLeastRecentlyUsedSlot(); + } + + update(*SelectedSlot, MemBankBits); + + return true; + } + +private: + /// Mapping between a Slot and the MemoryBanks that occupy the Slot. + std::map SlotToBanks; + /// If Slots have to be reassigned (because every Slot has already been + /// assigned to a Memory Bank), use an Index to cycle through already + /// used Slots. + unsigned ReassignIndex = 0; + const AIEBaseInstrInfo *TII; + + /// \return an unused Slot from the mapping. + std::optional getUnusedLoadSlot() const { + const SmallVector LoadSlots = + TII->getFormatInterface()->getLoadSlotKinds(); + + for (const auto &Slot : LoadSlots) { + + // check if Slot is already used in SlotToBanks + auto FoundSlot = SlotToBanks.find(Slot); + if (FoundSlot != SlotToBanks.end()) + continue; + + LLVM_DEBUG(dbgs() << " Found Unused Slot " << Slot << "\n"); + return Slot; + } + + // Every load Slot is already present in the mapping: fall back to the + // first load Slot. + // NOTE(review): this path always yields a Slot, so the + // getLeastRecentlyUsedSlot() fallback in assignSlot() looks unreachable - + // confirm whether an empty optional was intended here. + // FIXME: use a heuristic that takes Slots utilization and + // utilization of MemoryBanks into consideration. 
+ return LoadSlots[0]; + } + + /// Cycle through load Slots and \return an already used Slot + /// FIXME: use a heuristic that takes utilization into account, instead of + /// blindly cycling through the Slots. + std::optional getLeastRecentlyUsedSlot() { + const auto AvailableSlots = TII->getFormatInterface()->getLoadSlotKinds(); + + if (ReassignIndex >= AvailableSlots.size()) + ReassignIndex = 0; + + return AvailableSlots[ReassignIndex++]; + } +}; + +/// \return a mapping between Slots and the MemoryBanks of the loads within +/// \p MBB that are not multi-slot pseudos, i.e. already have a fixed Slot. +SlotMapping getAssignedSlots(const MachineBasicBlock &MBB, + const AIEBaseInstrInfo *TII, + const AIEHazardRecognizer &HR) { + SlotMapping SlotToBanks(TII); + + LLVM_DEBUG(dbgs() << "Collecting any already materialized Slot to MemoryBank " + "assignments\n"); + for (const auto &MI : MBB) { + if (!MI.mayLoad() || TII->isMultiSlotPseudo(MI)) + continue; + + const auto Slot = TII->getSlotKind(MI.getOpcode()); + const MemoryBankBits MemBankBits = HR.getMemoryBanks(&MI); + LLVM_DEBUG(dbgs() << "Slot: " << Slot << " MemoryBank: " << MemBankBits + << " on " << MI); + + SlotToBanks.update(Slot, MemBankBits); + } + return SlotToBanks; +} + +/// \return whether a valid assignment of Slots to MemoryBankBits is found. +/// Multi-Slot pseudo load instructions in \p MBB get a Slot assigned, according +/// to the MemoryBankBits that are attached to the MachineInstr. Existing mappings +/// in \p SlotToBanks are used and updated. +bool assignSlots(SlotMapping &SlotToBanks, const MachineBasicBlock &MBB, + const AIEBaseInstrInfo *TII, const AIEHazardRecognizer &HR) { + for (const auto &MI : MBB) { + if (!MI.mayLoad() || !TII->isMultiSlotPseudo(MI)) + continue; + + if (!SlotToBanks.assignSlot(MI, HR)) { + return false; + } + } + + return SlotToBanks.hasUniqueSlotForBank(); +} + +/// Materialize \p MI with a Slot according to overlapping MemoryBankBits +/// between \p MI and the Slot mapping in \p SlotToBanks . +/// NOTE(review): the MemoryBank, Slot and Opcode lookups are only checked by +/// asserts, so this relies on a preceding successful assignSlots(). 
+void materializeInstr(MachineInstr &MI, const SlotMapping &SlotToBanks, + const AIEBaseInstrInfo *TII, + const AIEHazardRecognizer &HR) { + auto MemBankBits = HR.getMemoryBanks(&MI); + assert(MemBankBits && "No MemoryBanks attached to MachineInstr."); + + const auto Slot = SlotToBanks.getAssignedSlot(MemBankBits); + assert(Slot && "Could not find Slot for MemoryBank!"); + + auto OpCode = TII->getSlotOpcode(*Slot, MI); + assert(OpCode && "Failed to retrieve a valid Opcode"); + + MI.setDesc(TII->get(*OpCode)); + LLVM_DEBUG(dbgs() << "Assigned " << *Slot << " to " << MI); +} + +/// Materialize multi-slot pseudo instructions in \p MBB according to +/// overlapping MemoryBankBits between a MachineInstr and the Slot mapping in +/// \p SlotToBanks . +void materializeSlots(const SlotMapping &SlotToBanks, MachineBasicBlock &MBB, + const AIEBaseInstrInfo *TII, + const AIEHazardRecognizer &HR) { + LLVM_DEBUG(dbgs() << "\nAssigning Slots to MachineInstr\n"); + + for (auto &MI : MBB) { + if (!MI.mayLoad() || !TII->isMultiSlotPseudo(MI)) + continue; + + materializeInstr(MI, SlotToBanks, TII, HR); + } +} + +/// Entry point. Collects the Slot to MemoryBank pairs of the loads in \p MBB +/// that are not multi-slot pseudos, assigns a Slot to every multi-slot pseudo +/// load and, only when assignSlots() reports success, rewrites those pseudos +/// to the opcode of their Slot. Otherwise \p MBB is left untouched. +void staticallyMaterializeMultiSlotInstructions(MachineBasicBlock &MBB, + const AIEHazardRecognizer &HR) { + LLVM_DEBUG(dbgs() << "Statically Assigning multi slot pseudos for " + << MBB.getName() << "\n"); + + const AIEBaseInstrInfo *TII = static_cast( + MBB.getParent()->getSubtarget().getInstrInfo()); + + auto SlotToBanks = getAssignedSlots(MBB, TII, HR); + + if (!assignSlots(SlotToBanks, MBB, TII, HR)) { + LLVM_DEBUG( + dbgs() + << "Could not find Slot Assignments, Skipping materialization\n"); + return; + } + + materializeSlots(SlotToBanks, MBB, TII, HR); +} +} // namespace llvm::AIE +// diff --git a/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h new file mode 100644 index 000000000000..84799906353c --- /dev/null +++ b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h @@ -0,0 +1,29 @@ +//===--- 
AIEMultiSlotInstrMaterializer.h -Multi Slot Instr materializer----===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// \file assigns an issue-slot to multi slot pseudo instructions within a single +// block loop to help loop pipelining. +// +//===----------------------------------------------------------------------===// +#include "AIEBaseInstrInfo.h" + +namespace llvm { +class AIEHazardRecognizer; +} + +namespace llvm::AIE { + +/// Statically assign and materialize Slots to multi-slot pseudo MachineInstr in +/// \p MBB . +/// FIXME: Currently we are only handling multi-slot memory load pseudos. +void staticallyMaterializeMultiSlotInstructions(MachineBasicBlock &MBB, + const AIEHazardRecognizer &HR); + +} // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 43955e01e38c..e3da0614be1b 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -65,7 +65,7 @@ class PostPipelineDumper : public PipelineScheduleVisitor { PostPipeliner::PostPipeliner(const AIEHazardRecognizer &HR, int NInstr) : HR(HR), NInstr(NInstr) {} -bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) { +bool PostPipeliner::isPostPipelineCandidate(MachineBasicBlock &LoopBlock) { // We leave the single-block loop criterion to our caller. It is fulfilled // by being a loopaware scheduling candidate. 
// First get us some instruments diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 63cb496ffe8d..b07f6f482b60 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -233,10 +233,10 @@ class PostPipeliner { /// Check whether this is a suitable loop for the PostPipeliner. It also /// leaves some useful information. - bool canAccept(MachineBasicBlock &LoopBlock); + bool isPostPipelineCandidate(MachineBasicBlock &LoopBlock); /// Get a lowerbound for the II required to accommodate the slots. - /// \pre canAccept has returned true + /// \pre isPostPipelineCandidate has returned true int getResMII(MachineBasicBlock &LoopBlock); // Schedule using the given InitiationInterval. Return true when successful. diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index c1dd7ed52124..c302c94ae055 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -112,6 +112,7 @@ add_llvm_target(AIECodeGen AIEMaxLatencyFinder.cpp AIEMCInstLower.cpp AIEMIRFormatter.cpp + AIEMultiSlotInstrMaterializer.cpp AIEPostPipeliner.cpp AIEPostSelectOptimize.cpp AIEPseudoBranchExpansion.cpp diff --git a/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp b/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp index 0541e6e4b4a4..5657720ae3a7 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp +++ b/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp @@ -37,4 +37,8 @@ const MCFormatDesc *AIE2MCFormats::getMCFormats() const { const PacketFormats &AIE2MCFormats::getPacketFormats() const { return Formats; } +SmallVector AIE2MCFormats::getLoadSlotKinds() const { + return {AIE2SlotKind::AIE2_SLOT_LDB, AIE2SlotKind::AIE2_SLOT_LDA}; +} + } // end namespace llvm diff --git a/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h b/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h index 46e7e0754838..8eb6d03b52d7 100644 --- 
a/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h +++ b/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h @@ -407,6 +407,11 @@ class AIEBaseMCFormats { virtual const PacketFormats &getPacketFormats() const = 0; + // \return all Slots that correspond to the load instructions + virtual SmallVector getLoadSlotKinds() const { + llvm_unreachable("Target didn't implement getLoadSlotKinds()"); + } + protected: /// Check if the Instruction is indeed into the Tables. void checkInstructionIsSupported(unsigned int Opcode) const; @@ -432,6 +437,7 @@ class AIE2MCFormats : public AIEBaseMCFormats { const MCSlotInfo *getSlotInfo(const MCSlotKind Kind) const override; const MCFormatDesc *getMCFormats() const override; const PacketFormats &getPacketFormats() const override; + SmallVector getLoadSlotKinds() const override; }; class AIE2PMCFormats : public AIEBaseMCFormats { @@ -443,6 +449,7 @@ class AIE2PMCFormats : public AIEBaseMCFormats { const MCSlotInfo *getSlotInfo(const MCSlotKind Kind) const override; const MCFormatDesc *getMCFormats() const override; const PacketFormats &getPacketFormats() const override; + SmallVector getLoadSlotKinds() const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp b/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp index 245c7abfd966..720b76c5f67c 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp +++ b/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp @@ -39,4 +39,8 @@ const PacketFormats &AIE2PMCFormats::getPacketFormats() const { return Formats; } +SmallVector AIE2PMCFormats::getLoadSlotKinds() const { + return {AIE2PSlotKind::AIE2P_SLOT_LDB, AIE2PSlotKind::AIE2P_SLOT_LDA}; +} + } // end namespace llvm diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll index 614e32a8520f..718c9954c25b 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll +++ 
b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll @@ -5,7 +5,8 @@ ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; ; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates -; RUN: llc -mtriple=aie2p --aie-force-postpipeliner %s -o - | FileCheck %s +; RUN: llc -mtriple=aie2p --aie-force-postpipeliner \ +; RUN: -aie-multi-slot-pseudo-instr=true %s -o - | FileCheck %s ; This is a bf16->bfp16 conversion function used by Conv2D kernels. ; Ultimately, we should target II=4 diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll index bb59ed7c98b4..7d995906793e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll @@ -5,7 +5,8 @@ ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; ; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates -; RUN: llc -mtriple=aie2p --aie-force-postpipeliner %s -o - | FileCheck %s +; RUN: llc -mtriple=aie2p --aie-force-postpipeliner \ +; RUN: -aie-multi-slot-pseudo-instr=true %s -o - | FileCheck %s ; This is a reduced version of the Conv2D_bfp16 kernel function which only contains ; the innermost loop. 
It was mostly obtained with llvm-extract, but stores were @@ -19,7 +20,7 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %newFuncRoot ; CHECK-NEXT: paddxm [sp], #64 -; CHECK-NEXT: st p6, [sp, #-60]; nopx // 4-byte Folded Spill +; CHECK-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill ; CHECK-NEXT: mov p6, sp ; CHECK-NEXT: padda [p6], #-320 ; CHECK-NEXT: vlda bmll3, [p6, #0] @@ -57,41 +58,42 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-NEXT: padda [p6], m0 ; CHECK-NEXT: vlda bmll0, [p6, #0] ; CHECK-NEXT: vlda bmlh0, [p6, #64] -; CHECK-NEXT: vlda bmhl0, [p6, #128] -; CHECK-NEXT: vlda bmhh0, [p6, #192]; movx r25, #0; mov dn0, p3 -; CHECK-NEXT: mova dc4, #0; vldb.fill.512 [p1, lf1, r25]; mov dj0, p4 -; CHECK-NEXT: mova r24, #0; vldb.fill.512 [p1, lf1, r25]; mov dn4, p5 -; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc0, dc4 -; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25]; add r1, r6, #-1; mov m0, p2 -; CHECK-NEXT: vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movxm ls, #.LBB0_1 -; CHECK-NEXT: vldb.pop.576 ex4, [p0, lf0, r24]; movxm le, #.L_LEnd0 -; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24, m1]; add.nc lc, r1, #-2 -; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.pop.576 ex4, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; nopv -; CHECK-NEXT: mova r0, #780; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; nopv +; CHECK-NEXT: vlda bmhl0, [p6, #128]; mov dn0, p3 +; CHECK-NEXT: vlda bmhh0, [p6, #192]; movx r25, #0; mov dj0, p4 +; CHECK-NEXT: mova dc4, #0; 
vldb.fill.512 [p1, lf1, r25]; mov dn4, p5 +; CHECK-NEXT: mova r24, #0; vldb.fill.512 [p1, lf1, r25]; mov dc0, dc4 +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; mov m0, p2 +; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0] +; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; add r1, r6, #-1 +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; movxm ls, #.LBB0_1 +; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movxm le, #.L_LEnd0 +; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; add.nc lc, r1, #-4 +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv +; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; nopv +; CHECK-NEXT: mova r0, #780; vldb.fill.512 [p1, lf1, r25]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; nopv +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex4, r0 +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; vmac.f dm1, dm1, ex10, ex4, r0 +; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body90.i ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nopxm ; vmac.f dm0, dm0, ex8, ex4, r0 -; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex4, r0 -; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; vmac.f dm2, dm2, ex8, ex6, r0 -; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm3, dm3, ex10, 
ex6, r0 -; CHECK-NEXT: vldb.pop.576.3d ex2, [p1, lf1, r25, d0] -; CHECK-NEXT: vldb.pop.576 ex4, [p0, lf0, r24]; vshuffle ex8, ex0, ex2, r4 +; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; nopx ; vshuffle ex10, ex0, ex2, r5; vmac.f dm3, dm3, ex10, ex6, r0 +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; vmac.f dm0, dm0, ex8, ex4, r0 +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex4, r0 ; CHECK-NEXT: .L_LEnd0: -; CHECK-NEXT: nopa ; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; nopv +; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup89.i.exitStub -; CHECK-NEXT: lda p6, [sp, #-60]; nopb ; nopx ; mov p0, r7; vmac.f dm0, dm0, ex8, ex4, r0 // 4-byte Folded Reload +; CHECK-NEXT: lda p6, [sp, #-60]; nopx ; vshuffle ex10, ex0, ex2, r5; vmac.f dm3, dm3, ex10, ex6, r0 // 4-byte Folded Reload +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vmac.f dm0, dm0, ex8, ex4, r0 +; CHECK-NEXT: mov p0, r7; vmac.f dm1, dm1, ex10, ex4, r0 +; CHECK-NEXT: vshuffle ex8, ex0, ex2, r4; vmac.f dm2, dm2, ex8, ex6, r0 +; CHECK-NEXT: vshuffle ex10, ex0, ex2, r5; vmac.f dm3, dm3, ex10, ex6, r0 +; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex4, r0 ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex4, r0 -; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 -; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 -; CHECK-NEXT: nop -; CHECK-NEXT: vshuffle ex8, ex0, ex2, r4 -; CHECK-NEXT: vshuffle ex10, ex0, ex2, r5 +; CHECK-NEXT: vshuffle ex8, ex0, ex2, r4; vmac.f dm2, dm2, ex8, ex6, r0 +; CHECK-NEXT: vshuffle ex10, ex0, ex2, r5; vmac.f dm3, dm3, ex10, ex6, r0 ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex4, r0 ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex4, r0 ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 diff --git 
a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir index ecd01959cc45..96f2ce6e6070 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir @@ -6,7 +6,7 @@ # (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates # RUN: llc --mtriple=aie2p --aie-loop-min-tripcount=7 %s \ -# RUN: --start-before=postmisched --debug-only=postpipeliner-summary -o - | FileCheck %s +# RUN: --start-before=postmisched --debug-only=postpipeliner-summary --aie-multi-slot-pseudo-instr=true -o - | FileCheck %s --- | @@ -14,39 +14,40 @@ ; CHECK-LABEL: conv2d_bfp16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: mova r25, #0 + ; CHECK-NEXT: mova r25, #0; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] ; CHECK-NEXT: mova r24, #0; vldb.fill.512 [p1, lf1, r25] - ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] - ; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25] - ; CHECK-NEXT: vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; add.nc lc, r1, #-2 - ; CHECK-NEXT: vldb.pop.576 ex2, [p0, lf0, r24]; movxm ls, #.LBB0_1 - ; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24, m1]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; nopv - ; CHECK-NEXT: mova r0, #780; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; nopv + ; CHECK-NEXT: vlda.fill.512 [p0, 
lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25] + ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0] + ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; add.nc lc, r1, #-4 + ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; movxm ls, #.LBB0_1 + ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; nopv + ; CHECK-NEXT: mova r0, #780; vldb.fill.512 [p1, lf1, r25]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; nopv + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 - ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex2, r0 - ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; vmac.f dm2, dm2, ex8, ex6, r0 - ; CHECK-NEXT: vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm3, dm3, ex10, ex6, r0 - ; CHECK-NEXT: vldb.pop.576.3d ex4, [p1, lf1, r25, d0] - ; CHECK-NEXT: vldb.pop.576 ex2, [p0, lf0, r24]; vshuffle ex8, ex0, ex4, r4 + ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; nopx 
; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex2, r0 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; nopv + ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: nopa ; nopb ; nopx ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: nopa ; nopb ; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vmac.f dm0, dm0, ex8, ex2, r0 ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 - ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 - ; CHECK-NEXT: vmac.f dm3, dm3, ex10, ex6, r0 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vshuffle ex8, ex0, ex4, r4 - ; CHECK-NEXT: vshuffle ex10, ex0, ex4, r5 + ; CHECK-NEXT: vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 + ; CHECK-NEXT: vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 ; CHECK-NEXT: vmac.f dm0, dm0, ex8, ex2, r0 ; CHECK-NEXT: vmac.f dm1, dm1, ex10, ex2, r0 ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir new file mode 100644 index 000000000000..3757d398a0eb --- /dev/null +++ 
b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir @@ -0,0 +1,104 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + + +# RUN: llc --mtriple=aie2p -O2 --start-before=postmisched \ +# RUN: -aie-multi-slot-pseudo-instr=true -verify-machineinstrs \ +# RUN: --aie-loop-min-tripcount=10 %s -o - | FileCheck %s + + +# Note that the LLVM IR doesn't match the actual MIR code. It is just a standard +# loop providing some pointers into different spaces to dereference. + +# future_conflict_assignment loop, that is currently not handled correctly, +# since a global view of the memoryBanks are not provided +# MI0 (via p0) = Memory Bank A +# MI1 (via p1) = Memory Bank D +# MI2 (via p0) = Memory Bank AD (need to compress Mappings to a single entry that contain AD) +# MI3 (via p1) = Memory Bank C (becomes Slot for itself) +# +# optimal but not found solution: +# addrspace: A, D, AD -> common Slot +# addrspace C -> common Slot +--- | + define dso_local void @future_conflict_assignment(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: future_conflict_assignment: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: + ; CHECK-NEXT: add.nc lc, r0, #0 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; 
CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: // %bb.3: + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + ret void + } +... +--- +name: future_conflict_assignment +alignment: 16 +tracksRegLiveness: true +body: | + bb.0 (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOVA 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2 (align 16): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $dc1, $dc5, $dm0, $dm1, $dm2, $dm3, $ex1, $ex2, $ex3, $ex4, $ex5, $ex6, $ex8, $ex10, $lf0, $lf1, $m4, $m5, $m7, $p0, $p1, $p3, $p4, $p5, $p6, $p7, $r3, $r6, $r7, $r8, $r9, $r16, $r17, $r24, $r25, $d3_3d:0x0001800000200C00 + + $p0, $lf0, $r24 = VLD_FILL_512_pseudo killed $p0, killed $lf0, killed $r24 :: (load (<32 x s16>), addrspace 5) + $p1, $lf1, $r25 = VLD_FILL_512_pseudo killed $p1, killed $lf1, killed $r25 :: (load (<32 x s16>), addrspace 8) + $p0, $lf0, $r24 = VLD_FILL_512_pseudo killed $p0, killed $lf0, killed $r24 :: (load (<32 x s16>), 
addrspace 11) + $p1, $lf1, $r25 = VLD_FILL_512_pseudo killed $p1, killed $lf1, killed $r25 :: (load (<32 x s16>), addrspace 7) + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir new file mode 100644 index 000000000000..1f2aee47f989 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + + +# RUN: llc --mtriple=aie2p -O2 --start-before=postmisched \ +# RUN: -aie-multi-slot-pseudo-instr=true -verify-machineinstrs \ +# RUN: --aie-loop-min-tripcount=10 %s -o - | FileCheck %s + + +# Note that the LLVM IR doesn't match the actual MIR code. It is just a standard +# loop providing some pointers into different spaces to dereference. 
+ +# non_overlapping_addrspace loop with 2 non-overlapping address spaces +# pointer p0 uses addrspace: 5 (A), 9 (AB), 10 (AC), 12 (BC) +# pointer p1 uses addrspace: 8 (D) +# +# addrspace: A, AB, BC, AC -> common Slot +# addrspace: D -> common Slot +--- | + define dso_local void @non_overlapping_addrspace(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: non_overlapping_addrspace: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: + ; CHECK-NEXT: add.nc lc, r0, #0 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nopx + ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] + ; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24] + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; vldb.pop.576.3d ex1, [p0, lf0, r24, d3]; nops ; nopxm ; nopv + ; CHECK-NEXT: // %bb.3: + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + ret void + } +... 
+--- +name: non_overlapping_addrspace +alignment: 16 +tracksRegLiveness: true +body: | + bb.0 (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOVA 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2(align 16): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $dc1, $dc5, $dm0, $dm1, $dm2, $dm3, $ex1, $ex2, $ex3, $ex4, $ex5, $ex6, $ex8, $ex10, $lf0, $lf1, $m4, $m5, $m7, $p0, $p1, $p3, $p4, $p5, $p6, $p7, $r3, $r6, $r7, $r8, $r9, $r16, $r17, $r24, $r25, $d3_3d:0x0001800000200C00 + + $p0, $lf0, $r24 = VLD_FILL_512_pseudo killed $p0, killed $lf0, killed $r24 :: (load (<32 x s16>), addrspace 5) + $p0, $lf0, $r24 = VLD_FILL_512_pseudo killed $p0, killed $lf0, killed $r24 :: (load (<32 x s16>), addrspace 9) + $p1, $lf1, $r25 = VLD_FILL_512_pseudo killed $p1, killed $lf1, killed $r25 :: (load (<32 x s16>), addrspace 8) + $ex6, $p0, $lf0, $r24 = VLD_POP_576_normal_pop_pseudo killed $p0, killed $lf0, killed $r24, implicit-def $srfifo_uf :: (load (<32 x s16>), addrspace 10) + $ex1, $p0, $lf0, $r24, $dc3, $dc7 = VLD_POP_576_3D_pseudo killed $p0, killed $lf0, killed $r24, $d3_3d, implicit-def $srfifo_uf :: (load unknown-size , align 1, addrspace 12) + PseudoLoopEnd , %bb.2 + + bb.3(align 16): + RET implicit $lr + DelayedSchedBarrier diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir new file mode 100644 index 000000000000..2ac65971ce5b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir @@ -0,0 +1,100 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +# +# This file is licensed under the Apache License v2.0 with 
LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + + +# RUN: llc --mtriple=aie2p -O2 --start-before=postmisched \ +# RUN: -aie-multi-slot-pseudo-instr=true -verify-machineinstrs \ +# RUN: --aie-loop-min-tripcount=10 %s -o - | FileCheck %s + + +# Note that the LLVM IR doesn't match the actual MIR code. It is just a standard +# loop providing some pointers into different spaces to dereference. + +# partially_materialized sample loop, with partially assigned Memory Banks to Slots: +# pointer p0 -> addrspace 5 -> slot not yet assigned -> assign VLDA +# pointer p1 -> addrspace 6 -> already assigned VLDB +--- | + define dso_local void @partially_materialized(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: partially_materialized: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: + ; CHECK-NEXT: add.nc lc, r0, #0 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nopm + ; CHECK-NEXT: 
vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24]; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: // %bb.3: + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + ret void + } +... +--- +name: partially_materialized +alignment: 16 +tracksRegLiveness: true +body: | + bb.0 (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOVA 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2 (align 16): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $dc1, $dc5, $dm0, $dm1, $dm2, $dm3, $ex1, $ex2, $ex3, $ex4, $ex5, $ex6, $ex8, $ex10, $lf0, $lf1, $m4, $m5, $m7, $p0, $p1, $p3, $p4, $p5, $p6, $p7, $r3, $r6, $r7, $r8, $r9, $r16, $r17, $r24, $r25, $d3_3d:0x0001800000200C00 + + $p0, $lf0, $r24 = VLD_FILL_512_pseudo killed $p0, killed $lf0, killed $r24 :: (load (<32 x s16>), addrspace 5) + $p0, $lf0, $r24 = VLD_FILL_512_pseudo killed $p0, killed $lf0, killed $r24 :: (load (<32 x s16>), addrspace 5) + $p1, $lf1, $r25 = VLDB_FILL_512 killed $p1, killed $lf1, killed $r25 :: (load (<32 x s16>), addrspace 6) + $ex6, $p0, $lf0, $r24 = VLD_POP_576_normal_pop_pseudo killed $p0, killed $lf0, killed $r24, implicit-def $srfifo_uf :: (load (<32 x s16>), addrspace 5) + $p1, $lf1, $r25 = VLD_FILL_512_pseudo killed $p1, killed $lf1, killed $r25 :: (load (<32 x s16>), addrspace 6) + + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier