diff --git a/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td b/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td index 5e87a2036589..ec475465f0f7 100644 --- a/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td +++ b/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td @@ -73,7 +73,7 @@ let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, Itinerary = II_ // Pseudo VLD let hasSideEffects = false, mayLoad = true, mayStore = false in { - let Itinerary = II_VLDA_W in { + let Itinerary = II_VLDB in { def VLD_idx_pseudo : MultiSlot_Pseudo< (outs mWa:$dst), (ins eP:$ptr, eDJ:$dj), "vld_idx_pseudo", "$dst, [$ptr, $dj]", @@ -83,7 +83,7 @@ let hasSideEffects = false, mayLoad = true, mayStore = false in { "vld_idx_imm_imm3x32_pseudo", "$dst, [$ptr, $imm]", [VLDB_dmw_ldb_ag_idx_imm, VLDA_dmw_lda_w_ag_idx_imm]>; } - let Itinerary = II_VLDA_POSTINC_W in + let Itinerary = II_VLDB_POSTINC in let Constraints = "$ptr_out = $ptr" in { def VLD_pstm_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out), (ins eP:$ptr, eM:$mod), @@ -94,14 +94,14 @@ let hasSideEffects = false, mayLoad = true, mayStore = false in { "vld_pstm_imm_4x32_pseudo", "$dst, [$ptr], $imm", [VLDB_dmw_ldb_ag_pstm_nrm_imm, VLDA_dmw_lda_w_ag_pstm_nrm_imm]>; } - let Itinerary = II_VLDA_2D_W in + let Itinerary = II_VLDB_2D in let Constraints = "$ptr_out = $ptr" in { def VLD_2D_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out, eDC:$count_out), (ins eP:$ptr, eD:$mod), "vld.2d_pseudo", "$dst, [$ptr], $mod", [VLDB_2D, VLDA_2D_dmw_lda_w]>, AIE_HasTiedSubregister; } - let Itinerary = II_VLDA_3D_W in + let Itinerary = II_VLDB_3D in let Constraints = "$ptr_out = $ptr" in { def VLD_3D_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out, eDC:$count_lo_out, eDC:$count_hi_out), (ins eP:$ptr, eDS:$mod), diff --git a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h index 01673a348efd..24636b21f085 100644 --- a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h +++ b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h @@ -31,6 +31,9 @@ class AIEAlternateDescriptors { AIEAlternateDescriptors() = default; ~AIEAlternateDescriptors() = default; + MIAltDescsMap::const_iterator begin() const { return AlternateDescs.begin(); } + MIAltDescsMap::const_iterator end() const { return AlternateDescs.end(); } + // Construct an alternate descriptor with the given alternate descriptors. AIEAlternateDescriptors(const MIAltDescsMap &AltDescs) : AlternateDescs(AltDescs) {} @@ -43,6 +46,10 @@ class AIEAlternateDescriptors { AlternateDescs[MI] = &TII->get(AltInstOpcode); } + void setAlternateDescriptor(MachineInstr *MI, const MCInstrDesc *AltDesc) { + AlternateDescs[MI] = AltDesc; + } + // Return the alternate descriptor for the given multi-opcode instruction. std::optional getSelectedDescriptor(MachineInstr *MI) const { diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp index 0e4fbe56ec73..90db336f1d52 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp @@ -473,7 +473,7 @@ ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType( bool AIEHazardRecognizer::checkConflict( const ResourceScoreboard &Scoreboard, MachineInstr &MI, int DeltaCycles) const { - const MCInstrDesc &Desc = MI.getDesc(); + const MCInstrDesc &Desc = *SelectedAltDescs.getDesc(&MI); const unsigned SchedClass = TII->getSchedClass(Desc, MI.operands(), MI.getMF()->getRegInfo()); const MemoryBankBits MemoryBanks = getMemoryBanks(&MI); diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 024ecbd09b48..b818083a87f2 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -300,6 +300,7 @@ namespace { /// into the appropriate blockstate region. /// TimedRegion is built one bundle at the time class PipelineExtractor : public PipelineScheduleVisitor { + AIEAlternateDescriptors &AlternateDesc; BlockState &Loop; BlockState *Prologue = nullptr; BlockState *Epilogue = nullptr; @@ -334,14 +335,20 @@ class PipelineExtractor : public PipelineScheduleVisitor { // Prologue and epilogue obtain copies. MachineInstr *ToBeEmitted = InLoop ? MI : Loop.TheBlock->getParent()->CloneMachineInstr(MI); - CurrentBundle.add(ToBeEmitted); + // We force the prologue/epilogue copies to have the same descriptor as the + // original instruction in the steady state. + if (auto AltDesc = AlternateDesc.getSelectedDescriptor(MI)) + AlternateDesc.setAlternateDescriptor(ToBeEmitted, AltDesc.value()); + + CurrentBundle.add(ToBeEmitted, AlternateDesc.getOpcode(MI)); } void endBundle() override { TimedRegion.emplace_back(CurrentBundle); } public: PipelineExtractor(InterBlockScheduling &InterBlock, BlockState &BS, const AIEBaseInstrInfo &TII) - : Loop(BS), CurrentBundle(TII.getFormatInterface()) { + : AlternateDesc(InterBlock.getSelectedAltDescs()), Loop(BS), + CurrentBundle(TII.getFormatInterface()) { MachineBasicBlock *LoopBlock = Loop.TheBlock; for (auto *P : LoopBlock->predecessors()) { if (P == LoopBlock) { diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 57fa9ed2ab52..4cf86c5b373d 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -93,6 +93,10 @@ static cl::opt PreSchedFollowsSkipPipeliner( "aie-presched-follows-skip-pipeliner", cl::init(true), cl::desc("Don't run the prescheduler if the pipeliner is skipped")); +static cl::opt ReAssignMultiSlotInstr( + "aie-reassign-multislot-instr", cl::init(true), + cl::desc("Re-assign multi-slot instructions during iterative scheduling")); + namespace { // A sentinel value to represent an unknown SUnit. const constexpr unsigned UnknownSUNum = ~0; @@ -277,6 +281,9 @@ void AIEPostRASchedStrategy::initializeBotScoreBoard(ScoreboardTrust Trust) { /// make sure we always have enough lookahead available. We arrange for that /// by starting in the earliest possible cycle, -Depth auto InsertInCycle = [=](MachineInstr &MI, int Cycle) { + assert(BotHazardRec->getSelectedAltDescs().getSelectedDescriptor(&MI) == + std::nullopt && + "Instructions opcode are already materialized"); BotHazardRec->emitInScoreboard( MI.getDesc(), BotHazardRec->getMemoryBanks(&MI), MI.operands(), MI.getMF()->getRegInfo(), Cycle - Depth); @@ -607,6 +614,9 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) { auto &BS = InterBlock.getBlockState(BB); + if (ReAssignMultiSlotInstr) + materializeMultiSlotInstrs(); + // TODO: Update assert when the fixed instructions become part of the // scheduling region. assert(BS.getRegions().empty() || @@ -673,8 +683,8 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) { if (BS.FixPoint.Stage != SchedulingStage::Scheduling) { return; } - materializeMultiOpcodeInstrs(); - InterBlock.getSelectedAltDescs().clear(); + if (!ReAssignMultiSlotInstr) + materializeMultiSlotInstrs(); if (IsBottomRegion) { // This is the earliest point where we can destroy the recorded // schedule in iterative scheduling. enterMBB and enterRegion are too early, @@ -700,25 +710,13 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) { DEBUG_BLOCKS(dbgs() << " << leaveRegion\n"); } -void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() { - const TargetInstrInfo *TII = getTII(CurMBB); - const AIEHazardRecognizer &TopHazardRec = *getAIEHazardRecognizer(Top); - const AIEHazardRecognizer &BotHazardRec = *getAIEHazardRecognizer(Bot); - - auto MaterializePseudo = [&TII](MachineInstr &MI, - const AIEHazardRecognizer &HazardRec) { - // Materialize instructions with multiple opcode options - if (std::optional AltOpcode = - HazardRec.getSelectedAltDescs().getSelectedOpcode(&MI)) { - MI.setDesc(TII->get(*AltOpcode)); - } - }; +void AIEPostRASchedStrategy::materializeMultiSlotInstrs() { + for (auto &[MI, Desc] : make_range(InterBlock.getSelectedAltDescs().begin(), + InterBlock.getSelectedAltDescs().end())) { + MI->setDesc(*Desc); + } - assert(DAG->top() == DAG->bottom()); - for (MachineInstr &MI : make_range(DAG->begin(), DAG->top())) - MaterializePseudo(MI, TopHazardRec); - for (MachineInstr &MI : make_range(DAG->bottom(), DAG->end())) - MaterializePseudo(MI, BotHazardRec); + InterBlock.getSelectedAltDescs().clear(); } const SUnit &getBundledSUnit(const ScheduleDAGMI *DAG, MachineInstr *MI) { diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.h b/llvm/lib/Target/AIE/AIEMachineScheduler.h index 213aa48f714b..a4455f61abcc 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.h +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.h @@ -152,7 +152,7 @@ class AIEPostRASchedStrategy : public PostGenericScheduler { /// Materialize "multi-opcode" instructions into the option that was selected /// at schedule time. See AIEHazardRecognizer::getSelectedAltOpcode(). - void materializeMultiOpcodeInstrs(); + void materializeMultiSlotInstrs(); /// Returns true if, when "concatenated", the Top and Bot zone have resource /// conflicts or timing issues. diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 48f194d0878b..d087a047b706 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -115,8 +115,10 @@ bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) { return true; } -static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) { - auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode())); +static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII, + const AIEHazardRecognizer &HR) { + const unsigned Opcode = HR.getSelectedAltDescs().getOpcode(&MI); + auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(Opcode)); return SlotInfo ? SlotInfo->getSlotSet() : 0; } @@ -124,7 +126,7 @@ int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) { // Add up all slot requirements and return the maximum slot count SlotCounts Counts; for (auto &MI : LoopBlock) { - Counts += getSlotCounts(MI, TII); + Counts += getSlotCounts(MI, TII, HR); } int MII = Counts.max(); LLVM_DEBUG(dbgs() << "PostPipeliner: ResMII=" << MII << "\n"); @@ -221,7 +223,7 @@ void PostPipeliner::computeForward() { const int NewEarliest = Me.Earliest + Dep.getSignedLatency(); SInfo.Earliest = std::max(SInfo.Earliest, NewEarliest); } - Me.Slots = getSlotCounts(*SU.getInstr(), TII); + Me.Slots = getSlotCounts(*SU.getInstr(), TII, HR); } } @@ -460,9 +462,9 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { if (N > 0 && HR.checkConflict(Scoreboard, *MI, Cycle)) { return false; } - - HR.emitInScoreboard(Scoreboard, MI->getDesc(), MemoryBanks, - MI->operands(), MI->getMF()->getRegInfo(), Cycle); + const MCInstrDesc &Desc = *HR.getSelectedAltDescs().getDesc(MI); + HR.emitInScoreboard(Scoreboard, Desc, MemoryBanks, MI->operands(), + MI->getMF()->getRegInfo(), Cycle); Cycle += II; } diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 0fe1d291e633..935608859039 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -282,19 +282,20 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: .LBB0_1: // %outer.loop.header ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 -; DCL-NEXT: nopa ; vldb wl6, [p1], #32; nopxm -; DCL-NEXT: vldb wl3, [p0], m6; mov r0, p0 +; DCL-NEXT: nopa ; vldb wl3, [p0], m6; nopx ; mov r0, p0; nops +; DCL-NEXT: vldb wl6, [p1], #32 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 -; DCL-NEXT: vldb wh6, [p1], #32 ; DCL-NEXT: vldb wh3, [p0], m6 +; DCL-NEXT: vldb wh6, [p1], #32 ; DCL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; DCL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 +; DCL-NEXT: vldb wl1, [p0], m6 ; DCL-NEXT: vldb wl8, [p1], #32 -; DCL-NEXT: vldb wl7, [p0], m6 ; DCL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 -; DCL-NEXT: vldb.3d wh7, [p0], d0 +; DCL-NEXT: vldb.3d wh1, [p0], d0 +; DCL-NEXT: vldb wh8, [p1], #32 ; DCL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r15 ; DCL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; DCL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] @@ -303,31 +304,30 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 ; DCL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 -; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] -; DCL-NEXT: vldb wh8, [p1], #32 ; DCL-NEXT: vldb wl5, [p0], m6; mov r1, p0 -; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9 -; DCL-NEXT: vldb wh5, [p0], m6; add r0, r0, #33 -; DCL-NEXT: vldb wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0 -; DCL-NEXT: vldb.3d wh3, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 +; DCL-NEXT: vldb wh5, [p0], m6 +; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; and r0, r0, r9 +; DCL-NEXT: vldb wl10, [p0], m6; add r0, r0, #33 +; DCL-NEXT: vldb.3d wh10, [p0], d0; vshift.align x4, x4, s1, x3, r0 +; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r10, r1, r9; vshift.align x2, x2, s1, x1, r0 ; DCL-NEXT: vldb wl1, [p1], #32; add r0, r10, #33; mov r10, p0 ; DCL-NEXT: vldb wh1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 -; DCL-NEXT: vldb wl10, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 -; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9 +; DCL-NEXT: vldb wl3, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 +; DCL-NEXT: vldb wh3, [p1], #32; and r10, r10, r9 ; DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_2: // %inner.loop ; DCL-NEXT: // Parent Loop BB0_1 Depth=1 ; DCL-NEXT: // => This Inner Loop Header: Depth=2 ; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 ; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 -; DCL-NEXT: vldb wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; DCL-NEXT: vldb wl5, [p0], m6; vshift.align x2, x2, s1, x10, r0 ; DCL-NEXT: vldb wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 -; DCL-NEXT: vldb wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 -; DCL-NEXT: vldb.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 -; DCL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 -; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 -; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 -; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 +; DCL-NEXT: vldb wl10, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: vldb.3d wh10, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 +; DCL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm6, cm6, x9, x8, r4 // Delay Slot 4 +; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm2, cm2, x9, x6, r4 // Delay Slot 3 +; DCL-NEXT: vldb wl3, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 +; DCL-NEXT: vldb wh3, [p1], #32; and r10, r10, r9; vmov x8, x3; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; DCL-NEXT: nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 @@ -343,24 +343,24 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload ; DCL-NEXT: vmac cm0, cm0, x7, x6, r4 ; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm1, cm1, x9, x6, r4 // 4-byte Folded Reload -; DCL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 +; DCL-NEXT: vshift.align x2, x2, s1, x10, r0; vmac cm3, cm3, x11, x6, r4 ; DCL-NEXT: vshuffle x6, x4, x2, r2 ; DCL-NEXT: vmac cm6, cm7, x6, x8, r4 ; DCL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x1, r4 ; DCL-NEXT: st dn7, [sp, #-92] // 4-byte Folded Spill -; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 +; DCL-NEXT: vshuffle x10, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; DCL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 +; DCL-NEXT: vshuffle x5, x10, x0, r8; vmac cm1, cm2, x10, x1, r4 ; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32] // 4-byte Folded Reload ; DCL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 // 4-byte Folded Reload ; DCL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] -; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x3, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; DCL-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m1, r27; vmac cm8, cm4, x8, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m1, r27; vmac cm8, cm4, x8, x3, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] -; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x10, x3, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] -; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x3, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32] ; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 ; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m2, r13 @@ -444,38 +444,38 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vldb wl6, [p1], #32; nopa ; nops ; nopxm ; nopv -; ZOL-NEXT: vldb wl3, [p0], m6; mov r0, p0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopx +; ZOL-NEXT: vldb wl3, [p0], m6; nopa ; nops ; nopx ; mov r0, p0; nopv +; ZOL-NEXT: nopa ; vldb wl6, [p1], #32; nopx +; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 -; ZOL-NEXT: vldb wh6, [p1], #32 ; ZOL-NEXT: vldb wh3, [p0], m6 +; ZOL-NEXT: vldb wh6, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; ZOL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 +; ZOL-NEXT: vldb wl1, [p0], m6 ; ZOL-NEXT: vldb wl8, [p1], #32 -; ZOL-NEXT: vldb wl7, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 +; ZOL-NEXT: vldb.3d wh1, [p0], d0 ; ZOL-NEXT: vldb wh8, [p1], #32 -; ZOL-NEXT: vldb.3d wh7, [p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 -; ZOL-NEXT: vldb wl1, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m5 ; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 ; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] -; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; movxm ls, #.LBB0_2 +; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 +; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; movxm ls, #.LBB0_2 ; ZOL-NEXT: vldb wl5, [p0], m6; mov r1, p0 ; ZOL-NEXT: vldb wh5, [p0], m6; movxm le, #.L_LEnd0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; and r0, r0, r9; add.nc lc, r5, #-2 -; ZOL-NEXT: vldb wl3, [p0], m6; nopa ; nops ; add r0, r0, #33; nopm ; nopv -; ZOL-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv -; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0; nopv +; ZOL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9; add.nc lc, r5, #-2 +; ZOL-NEXT: vldb wl10, [p0], m6; nopa ; nops ; add r0, r0, #33; nopm ; nopv +; ZOL-NEXT: vldb.3d wh10, [p0], d0; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv +; ZOL-NEXT: vldb wl1, [p1], #32; nopa ; nops ; and r1, r1, r9; vshift.align x2, x2, s1, x1, r0; nopv ; ZOL-NEXT: vldb wh1, [p1], #32; nopa ; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: vldb wl10, [p1], #32; nopa ; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; nopx ; vshuffle x9, x7, x0, r8; nopv +; ZOL-NEXT: vldb wl3, [p1], #32; nopa ; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv +; ZOL-NEXT: vldb wh3, [p1], #32; nopa ; nops ; nopx ; vshuffle x9, x7, x0, r8; nopv ; ZOL-NEXT: nopb ; nopa ; nops ; and r1, r1, r9; nopm ; nopv ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop @@ -483,14 +483,14 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 ; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 ; ZOL-NEXT: vldb wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 -; ZOL-NEXT: vldb wh5, [p0], m6; vshift.align x2, x2, s1, x3, r0 -; ZOL-NEXT: vldb wl3, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vldb.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 -; ZOL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 -; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 -; ZOL-NEXT: vldb wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: vldb wh5, [p0], m6; vshift.align x2, x2, s1, x10, r0 +; ZOL-NEXT: vldb wl10, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 +; ZOL-NEXT: vldb.3d wh10, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 +; ZOL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm6, cm6, x9, x8, r4 +; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm2, cm2, x9, x6, r4 +; ZOL-NEXT: vldb wl3, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: -; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 +; ZOL-NEXT: vldb wh3, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x3; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; ZOL-NEXT: nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 @@ -506,24 +506,24 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload ; ZOL-NEXT: vmac cm0, cm0, x7, x6, r4 ; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm1, cm1, x9, x6, r4 // 4-byte Folded Reload -; ZOL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: vshift.align x2, x2, s1, x10, r0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: vshuffle x6, x4, x2, r2 ; ZOL-NEXT: vmac cm6, cm7, x6, x8, r4 ; ZOL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x1, r4 ; ZOL-NEXT: st dn7, [sp, #-92] // 4-byte Folded Spill -; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 +; ZOL-NEXT: vshuffle x10, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; ZOL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 +; ZOL-NEXT: vshuffle x5, x10, x0, r8; vmac cm1, cm2, x10, x1, r4 ; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32] // 4-byte Folded Reload ; ZOL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 // 4-byte Folded Reload ; ZOL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] -; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x3, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; ZOL-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m1, r26; vmac cm8, cm4, x8, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m1, r26; vmac cm8, cm4, x8, x3, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] -; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x10, x3, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] -; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x3, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32] ; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 ; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m2, r12 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index 761dd1e918aa..604973feaf36 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -78,34 +78,34 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova r4, #-1 -; CHECK-NEXT: mova dc0, #0; vldb wl2, [p1], #32; lshl r1, r1, r4 -; CHECK-NEXT: vldb wl8, [p1], #32; add r1, r1, #-1; mov dc4, dc0 -; CHECK-NEXT: vldb.3d wl6, [p0], d0; jz r1, #.LBB0_4 -; CHECK-NEXT: vldb.3d wl4, [p0], d0 // Delay Slot 5 +; CHECK-NEXT: mova dc0, #0; vldb wl6, [p1], #32; lshl r1, r1, r4 +; CHECK-NEXT: vldb wl4, [p1], #32; add r1, r1, #-1; mov dc4, dc0 +; CHECK-NEXT: vldb.3d wl8, [p0], d0; jz r1, #.LBB0_4 +; CHECK-NEXT: vldb.3d wl2, [p0], d0 // Delay Slot 5 ; CHECK-NEXT: extend.u8 r5, r5 // Delay Slot 4 ; CHECK-NEXT: mova r3, #0; movx r2, #1; mov s0, r5 // Delay Slot 3 ; CHECK-NEXT: ne r2, r0, r2; vbcst.8 x0, r3 // Delay Slot 2 ; CHECK-NEXT: mova r0, #808; mov crSRSSign, r2 // Delay Slot 1 ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh6, wl0 -; CHECK-NEXT: vmov wh4, wl0 +; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh8, wl0 +; CHECK-NEXT: vmov wh2, wl0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldb wl2, [p1], #32; nopxm -; CHECK-NEXT: vldb.3d wl6, [p0], d0; add r1, r1, #-1; vmul cm0, x6, x2, r0 -; CHECK-NEXT: vldb.3d wl4, [p0], d0; jnz r1, #.LBB0_3; vmul cm1, x4, x8, r0 -; CHECK-NEXT: vldb wl8, [p1], #32 // Delay Slot 5 +; CHECK-NEXT: vldb wl6, [p1], #32; nopxm +; CHECK-NEXT: vldb.3d wl8, [p0], d0; add r1, r1, #-1; vmul cm0, x8, x6, r0 +; CHECK-NEXT: vldb.3d wl2, [p0], d0; jnz r1, #.LBB0_3; vmul cm1, x2, x4, r0 +; CHECK-NEXT: vldb wl4, [p1], #32 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: vst.srs.d8.s32 cm0, s0, [p2], #32 // Delay Slot 2 ; CHECK-NEXT: vst.srs.d8.s32 cm1, s0, [p2], #32 // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh6, wl0; nopv -; CHECK-NEXT: nopa ; vmov wh4, wl0 -; CHECK-NEXT: vmul cm0, x6, x2, r0 -; CHECK-NEXT: vmul cm1, x4, x8, r0 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh8, wl0; nopv +; CHECK-NEXT: nopa ; vmov wh2, wl0 +; CHECK-NEXT: vmul cm0, x8, x6, r0 +; CHECK-NEXT: vmul cm1, x2, x4, r0 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir new file mode 100644 index 000000000000..24f1269a1db1 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir @@ -0,0 +1,301 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 --run-pass=postmisched --aie-reassign-multislot-instr=true %s -o - | FileCheck %s --check-prefix=ON +# RUN: llc --mtriple=aie2 --run-pass=postmisched --aie-reassign-multislot-instr=false %s -o - | FileCheck %s --check-prefix=OFF + +--- +name: multislot_across_loop +alignment: 16 +tracksRegLiveness: true +body: | + ; ON-LABEL: name: multislot_across_loop + ; ON: bb.0: + ; ON-NEXT: successors: %bb.1(0x80000000) + ; ON-NEXT: liveins: $p0, $r0, $r1, $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.1: + ; ON-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; ON-NEXT: liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + ; ON-NEXT: {{ $}} + ; ON-NEXT: $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 { + ; ON-NEXT: $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + ; ON-NEXT: } + ; ON-NEXT: $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0 + ; ON-NEXT: BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 { + ; ON-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0 + ; ON-NEXT: } + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 { + ; ON-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0 + ; ON-NEXT: } + ; ON-NEXT: $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 { + ; ON-NEXT: $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0 + ; ON-NEXT: } + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; ON-NEXT: $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: } + ; ON-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 { + ; ON-NEXT: $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $wh6 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; ON-NEXT: $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $r3, implicit-def $srcarry, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit killed $r3, implicit $x6, implicit $x2, implicit $r0 { + ; ON-NEXT: $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry + ; ON-NEXT: $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit killed $wh2, implicit $p2, implicit $r3 { + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: JNZ $r3, %bb.1 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $p2, implicit-def $wh1, implicit killed $wl2, implicit killed $p2, implicit $wl0 { + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $wh1 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 { + ; ON-NEXT: $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wh7 = VMOV_mv_w $wl0 + ; ON-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl4, implicit-def $wh3, implicit $p1, implicit $wl0 { + ; ON-NEXT: $wl4 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wh3 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl2, implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $wl0, implicit $x1, implicit killed $x10, implicit $r0 { + ; ON-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wh2 = VMOV_mv_w $wl0 + ; ON-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x1, killed $x10, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p0, implicit $m0, implicit killed $p1, implicit $m1, implicit $wl0 { + ; ON-NEXT: $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wh10 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: DelayedSchedBarrier + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.2: + ; ON-NEXT: successors: %bb.3(0x80000000) + ; ON-NEXT: liveins: $r1, $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: $r2 = OR killed $r2, killed $r1 + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.3: + ; ON-NEXT: liveins: $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: RET implicit $lr + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: DelayedSchedBarrier implicit killed $r2 + ; + ; OFF-LABEL: name: multislot_across_loop + ; OFF: bb.0: + ; OFF-NEXT: successors: %bb.1(0x80000000) + ; OFF-NEXT: liveins: $p0, $r0, $r1, $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.1: + ; OFF-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; OFF-NEXT: liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 { + ; OFF-NEXT: $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0 + ; OFF-NEXT: BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 { + ; OFF-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 { + ; OFF-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 { + ; OFF-NEXT: $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh2, implicit-def $srsrs_of, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd { + ; OFF-NEXT: $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 { + ; OFF-NEXT: $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $wh6 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; OFF-NEXT: $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit $x6, implicit $x2, implicit $r0 { + ; OFF-NEXT: $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $p2, implicit-def $r3, implicit-def $srcarry, implicit killed $wl2, implicit killed $p2, implicit killed $r3 { + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit $r3 { + ; OFF-NEXT: $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: JNZ $r3, %bb.1 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl4, implicit-def $wl2, implicit-def $wh1, implicit $p1, implicit $p0, implicit $wl0 { + ; OFF-NEXT: $wl4 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh1 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 { + ; OFF-NEXT: $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh7 = VMOV_mv_w $wl0 + ; OFF-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $wh3 = VMOV_mv_w $wl0 + ; OFF-NEXT: BUNDLE implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $wl0, implicit $x1, implicit $x10, implicit $r0 { + ; OFF-NEXT: $wh2 = VMOV_mv_w $wl0 + ; OFF-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x1, $x10, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p1, implicit $m1, implicit $wl0 { + ; OFF-NEXT: $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wh10 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: DelayedSchedBarrier + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.2: + ; OFF-NEXT: successors: %bb.3(0x80000000) + ; OFF-NEXT: liveins: $r1, $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: $r2 = OR killed $r2, killed $r1 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.3: + ; OFF-NEXT: liveins: $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: RET implicit $lr + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: DelayedSchedBarrier implicit killed $r2 + bb.0: + liveins: $p0, $r0, $r1, $r2 + successors: %bb.1 + bb.1: + successors: %bb.1, %bb.2 + liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + $cm3 = VMUL_vmac_cm_core_dense $x7, $x9, $r0 + $wh3 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl3 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $cm1 = VMUL_vmac_cm_core_dense $x6, $x8, $r0 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl3, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $cm2 = VMUL_vmac_cm_core_dense $x2, $x4, $r0 + $wh2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl2 = VSRS_S8_S32_mv_w_srs $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $cm0 = VMUL_vmac_cm_core_dense $x10, $x1, $r0 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $wl2 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wh4 = VSRS_S8_S32_mv_w_srs $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl4 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl4, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $wl4, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wh6 = VMOV_mv_w $wl0 + $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + $wh2 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl10 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl3, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl9 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wl7 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wl4 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wh1 = VMOV_mv_w $wl0 + $cm0 = VMUL_vmac_cm_core_dense $x1, $x10, $r0 + $wl10, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl1, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $r3 = ADD_add_r_ri $r3, -4, implicit-def $srcarry + $wh3 = VMOV_mv_w $wl0 + $wh7 = VMOV_mv_w $wl0 + $wh2 = VMOV_mv_w $wl0 + $wh10 = VMOV_mv_w $wl0 + JNZ $r3, %bb.1 + DelayedSchedBarrier + bb.2: + liveins: $r1, $r2 + successors: %bb.3 + $r2 = OR $r2, $r1 + bb.3: + liveins: $r2 + RET implicit $lr + DelayedSchedBarrier implicit $r2 +...