From fdab2afbb0062d5d0831fb5335b9927f2bfbc8b1 Mon Sep 17 00:00:00 2001
From: Krishnam Tibrewala
Date: Tue, 22 Oct 2024 11:32:24 -0700
Subject: [PATCH] [AIEX] Re-assign multi-slot instructions during iterative
 scheduling

---
 llvm/lib/Target/AIE/AIEAlternateDescriptors.h |   7 +
 llvm/lib/Target/AIE/AIEHazardRecognizer.cpp   |   2 +-
 .../Target/AIE/AIEInterBlockScheduling.cpp    |  10 +-
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp   |  45 ++-
 llvm/lib/Target/AIE/AIEPostPipeliner.cpp      |   8 +-
 .../schedule/loopaware/loop-multiSlot.mir     | 301 ++++++++++++++++++
 6 files changed, 343 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir

diff --git a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
index 01673a348efd..24636b21f085 100644
--- a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
+++ b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
@@ -31,6 +31,9 @@ class AIEAlternateDescriptors {
   AIEAlternateDescriptors() = default;
   ~AIEAlternateDescriptors() = default;
 
+  MIAltDescsMap::const_iterator begin() const { return AlternateDescs.begin(); }
+  MIAltDescsMap::const_iterator end() const { return AlternateDescs.end(); }
+
   // Construct an alternate descriptor with the given alternate descriptors.
   AIEAlternateDescriptors(const MIAltDescsMap &AltDescs)
       : AlternateDescs(AltDescs) {}
@@ -43,6 +46,10 @@ class AIEAlternateDescriptors {
     AlternateDescs[MI] = &TII->get(AltInstOpcode);
   }
 
+  void setAlternateDescriptor(MachineInstr *MI, const MCInstrDesc *AltDesc) {
+    AlternateDescs[MI] = AltDesc;
+  }
+
   // Return the alternate descriptor for the given multi-opcode instruction.
   std::optional<const MCInstrDesc *>
   getSelectedDescriptor(MachineInstr *MI) const {
diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
index 0b535ef7354a..af674bfb2a44 100644
--- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
+++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
@@ -451,7 +451,7 @@ ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType(
 bool AIEHazardRecognizer::checkConflict(
     const ResourceScoreboard<FuncUnitWrapper> &Scoreboard, MachineInstr &MI,
     int DeltaCycles) const {
-  const MCInstrDesc &Desc = MI.getDesc();
+  const MCInstrDesc &Desc = *SelectedAltDescs.getDesc(&MI);
   const unsigned SchedClass =
       TII->getSchedClass(Desc, MI.operands(), MI.getMF()->getRegInfo());
   const MemoryBankBits MemoryBanks = getMemoryBanks(&MI);
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index c7e57db45a1b..6e7e54e2cabc 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -300,6 +300,7 @@ namespace {
 /// into the appropriate blockstate region.
 /// TimedRegion is built one bundle at the time
 class PipelineExtractor : public PipelineScheduleVisitor {
+  AIEAlternateDescriptors &AlternateDesc;
   BlockState &Loop;
   BlockState *Prologue = nullptr;
   BlockState *Epilogue = nullptr;
@@ -330,14 +331,19 @@ class PipelineExtractor : public PipelineScheduleVisitor {
     // Prologue and epilogue obtain copies.
     MachineInstr *ToBeEmitted = InLoop ?
        MI : Loop.TheBlock->getParent()->CloneMachineInstr(MI);
-    CurrentBundle.add(ToBeEmitted);
+    if (auto AltDesc = AlternateDesc.getSelectedDescriptor(MI);
+        AltDesc.has_value())
+      AlternateDesc.setAlternateDescriptor(ToBeEmitted, AltDesc.value());
+
+    CurrentBundle.add(ToBeEmitted, AlternateDesc.getOpcode(MI));
   }
   void endBundle() override { TimedRegion.emplace_back(CurrentBundle); }
 
 public:
   PipelineExtractor(InterBlockScheduling &InterBlock, BlockState &BS,
                     const AIEBaseInstrInfo &TII)
-      : Loop(BS), CurrentBundle(TII.getFormatInterface()) {
+      : AlternateDesc(InterBlock.getSelectedAltDescs()), Loop(BS),
+        CurrentBundle(TII.getFormatInterface()) {
     MachineBasicBlock *LoopBlock = Loop.TheBlock;
     for (auto *P : LoopBlock->predecessors()) {
       if (P == LoopBlock) {
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index 4296bece91b5..0d8fdacb2891 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -88,6 +88,10 @@ static cl::opt<bool> UseLoopHeuristics(
     "aie-loop-sched-heuristics", cl::init(true),
     cl::desc("Use special picking heuristics when scheduling a loop region"));
 
+static cl::opt<bool> ReAssignMultiSlotInstr(
+    "aie-reassign-multislot-instr", cl::init(true),
+    cl::desc("Re-assign multi-slot instructions during iterative scheduling"));
+
 namespace {
 // A sentinel value to represent an unknown SUnit.
 const constexpr unsigned UnknownSUNum = ~0;
@@ -269,7 +273,8 @@ void AIEPostRASchedStrategy::initializeBotScoreBoard(ScoreboardTrust Trust) {
   /// by starting in the earliest possible cycle, -Depth
   auto InsertInCycle = [=](MachineInstr &MI, int Cycle) {
     BotHazardRec->emitInScoreboard(
-        MI.getDesc(), BotHazardRec->getMemoryBanks(&MI), MI.operands(),
+        *BotHazardRec->getSelectedAltDescs().getDesc(&MI),
+        BotHazardRec->getMemoryBanks(&MI), MI.operands(),
         MI.getMF()->getRegInfo(), Cycle - Depth);
   };
   auto BlockCycle = [=](int Cycle) {
@@ -536,7 +541,20 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
   IsBottomRegion = true;
 }
 
+void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() {
+  for (auto [MI, Desc] : make_range(InterBlock.getSelectedAltDescs().begin(),
+                                    InterBlock.getSelectedAltDescs().end())) {
+    MI->setDesc(*Desc);
+  }
+
+  InterBlock.getSelectedAltDescs().clear();
+}
+
 void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) {
+
+  if (ReAssignMultiSlotInstr)
+    materializeMultiOpcodeInstrs();
+
   auto &BS = InterBlock.getBlockState(BB);
 
   // Safety margin, swp epilogue
@@ -599,8 +617,6 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   if (BS.FixPoint.Stage != SchedulingStage::Scheduling) {
     return;
   }
-  materializeMultiOpcodeInstrs();
-  InterBlock.getSelectedAltDescs().clear();
   if (IsBottomRegion) {
     // This is the earliest point where we can destroy the recorded
     // schedule in iterative scheduling.
    // enterMBB and enterRegion are too early,
@@ -616,6 +632,8 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   assert(BS.getCurrentRegion().Bundles.empty());
   BS.addBundles(TopBundles);
   BS.addBundles(BotBundles);
+  if (!ReAssignMultiSlotInstr)
+    materializeMultiOpcodeInstrs();
   RegionBegin = nullptr;
   RegionEnd = nullptr;
   IsBottomRegion = false;
@@ -623,27 +641,6 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   DEBUG_BLOCKS(dbgs() << " << leaveRegion\n");
 }
 
-void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() {
-  const TargetInstrInfo *TII = getTII(CurMBB);
-  const AIEHazardRecognizer &TopHazardRec = *getAIEHazardRecognizer(Top);
-  const AIEHazardRecognizer &BotHazardRec = *getAIEHazardRecognizer(Bot);
-
-  auto MaterializePseudo = [&TII](MachineInstr &MI,
-                                  const AIEHazardRecognizer &HazardRec) {
-    // Materialize instructions with multiple opcode options
-    if (std::optional<unsigned> AltOpcode =
-            HazardRec.getSelectedAltDescs().getSelectedOpcode(&MI)) {
-      MI.setDesc(TII->get(*AltOpcode));
-    }
-  };
-
-  assert(DAG->top() == DAG->bottom());
-  for (MachineInstr &MI : make_range(DAG->begin(), DAG->top()))
-    MaterializePseudo(MI, TopHazardRec);
-  for (MachineInstr &MI : make_range(DAG->bottom(), DAG->end()))
-    MaterializePseudo(MI, BotHazardRec);
-}
-
 bool AIEPostRASchedStrategy::checkInterZoneConflicts(
     const std::vector<AIE::MachineBundle> &BotBundles) const {
   const AIEHazardRecognizer *TopHazardRec = getAIEHazardRecognizer(Top);
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
index 961aacb6acca..a5193f24266e 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
@@ -112,7 +112,8 @@ int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) {
   std::vector<SlotBits> Scoreboard(NInstr, 0);
   int MII = 1;
   for (auto &MI : LoopBlock) {
-    auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode()));
+    const unsigned Opcode = HR.getSelectedAltDescs().getOpcode(&MI);
+    auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(Opcode));
     SlotBits Slots = SlotInfo ? SlotInfo->getSlotSet() : 0;
 
     int C = 0;
@@ -290,6 +291,7 @@ bool PostPipeliner::scheduleFirstIteration() {
       return false;
     }
     const int LocalCycle = Actual % II;
+    const MCInstrDesc &Desc = *HR.getSelectedAltDescs().getDesc(MI);
     const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI);
     LLVM_DEBUG(dbgs() << " Emit in " << -Depth + LocalCycle << "\n");
     int Cycle = -Depth + LocalCycle;
@@ -299,8 +301,8 @@ bool PostPipeliner::scheduleFirstIteration() {
       return false;
     }
 
-    HR.emitInScoreboard(Scoreboard, MI->getDesc(), MemoryBanks,
-                        MI->operands(), MI->getMF()->getRegInfo(), Cycle);
+    HR.emitInScoreboard(Scoreboard, Desc, MemoryBanks, MI->operands(),
+                        MI->getMF()->getRegInfo(), Cycle);
     Cycle += II;
   }
 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir
new file mode 100644
index 000000000000..24f1269a1db1
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir
@@ -0,0 +1,301 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates + +# RUN: llc --mtriple=aie2 --run-pass=postmisched --aie-reassign-multislot-instr=true %s -o - | FileCheck %s --check-prefix=ON +# RUN: llc --mtriple=aie2 --run-pass=postmisched --aie-reassign-multislot-instr=false %s -o - | FileCheck %s --check-prefix=OFF + +--- +name: multislot_across_loop +alignment: 16 +tracksRegLiveness: true +body: | + ; ON-LABEL: name: multislot_across_loop + ; ON: bb.0: + ; ON-NEXT: successors: %bb.1(0x80000000) + ; ON-NEXT: liveins: $p0, $r0, $r1, $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.1: + ; ON-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; ON-NEXT: liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + ; ON-NEXT: {{ $}} + ; ON-NEXT: $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 { + ; ON-NEXT: $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + ; ON-NEXT: } + ; ON-NEXT: $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0 + ; ON-NEXT: BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 { + ; ON-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0 + ; ON-NEXT: } + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 { + ; ON-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0 + ; ON-NEXT: } + ; ON-NEXT: $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def 
$amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 { + ; ON-NEXT: $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0 + ; ON-NEXT: } + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; ON-NEXT: $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: } + ; ON-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 { + ; ON-NEXT: $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $wh6 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; ON-NEXT: $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $r3, implicit-def $srcarry, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit killed $r3, implicit $x6, implicit $x2, implicit $r0 { + ; ON-NEXT: $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry + ; ON-NEXT: $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit killed $wh2, implicit $p2, implicit $r3 { + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: JNZ $r3, %bb.1 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $p2, implicit-def $wh1, implicit killed $wl2, implicit killed $p2, implicit $wl0 { + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $wh1 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 { + ; ON-NEXT: $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wh7 = 
VMOV_mv_w $wl0 + ; ON-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl4, implicit-def $wh3, implicit $p1, implicit $wl0 { + ; ON-NEXT: $wl4 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wh3 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl2, implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $wl0, implicit $x1, implicit killed $x10, implicit $r0 { + ; ON-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wh2 = VMOV_mv_w $wl0 + ; ON-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x1, killed $x10, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p0, implicit $m0, implicit killed $p1, implicit $m1, implicit $wl0 { + ; ON-NEXT: $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wh10 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: DelayedSchedBarrier + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.2: + ; ON-NEXT: successors: %bb.3(0x80000000) + ; ON-NEXT: liveins: $r1, $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: $r2 = OR killed $r2, killed $r1 + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.3: + ; ON-NEXT: liveins: $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: RET implicit $lr + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: DelayedSchedBarrier implicit killed $r2 + ; + ; OFF-LABEL: name: multislot_across_loop + ; OFF: bb.0: + ; OFF-NEXT: successors: %bb.1(0x80000000) + ; OFF-NEXT: liveins: $p0, $r0, $r1, $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.1: + ; OFF-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; OFF-NEXT: liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 { + ; OFF-NEXT: $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0 + ; OFF-NEXT: BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 { + ; OFF-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: 
$p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 { + ; OFF-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 { + ; OFF-NEXT: $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh2, implicit-def $srsrs_of, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd { + ; OFF-NEXT: $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 { + ; OFF-NEXT: $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $wh6 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; OFF-NEXT: $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, 
implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit $x6, implicit $x2, implicit $r0 { + ; OFF-NEXT: $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $p2, implicit-def $r3, implicit-def $srcarry, implicit killed $wl2, implicit killed $p2, implicit killed $r3 { + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit $r3 { + ; OFF-NEXT: $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: JNZ $r3, %bb.1 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl4, implicit-def $wl2, implicit-def $wh1, implicit $p1, implicit $p0, implicit $wl0 { + ; OFF-NEXT: $wl4 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh1 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 { + ; OFF-NEXT: $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh7 = VMOV_mv_w $wl0 + ; OFF-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $wh3 = VMOV_mv_w $wl0 + ; OFF-NEXT: BUNDLE implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $wl0, implicit $x1, implicit $x10, implicit $r0 { + ; OFF-NEXT: $wh2 = VMOV_mv_w $wl0 + ; OFF-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x1, $x10, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p1, implicit $m1, implicit $wl0 { + ; OFF-NEXT: $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wh10 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: DelayedSchedBarrier + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.2: + ; OFF-NEXT: successors: %bb.3(0x80000000) + ; OFF-NEXT: liveins: $r1, $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: $r2 = OR killed $r2, killed $r1 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.3: + ; OFF-NEXT: liveins: $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: RET implicit $lr + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: DelayedSchedBarrier implicit killed $r2 + bb.0: + liveins: $p0, $r0, $r1, $r2 + successors: %bb.1 + bb.1: + successors: %bb.1, %bb.2 + liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, 
$r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + $cm3 = VMUL_vmac_cm_core_dense $x7, $x9, $r0 + $wh3 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl3 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $cm1 = VMUL_vmac_cm_core_dense $x6, $x8, $r0 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl3, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $cm2 = VMUL_vmac_cm_core_dense $x2, $x4, $r0 + $wh2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl2 = VSRS_S8_S32_mv_w_srs $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $cm0 = VMUL_vmac_cm_core_dense $x10, $x1, $r0 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $wl2 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wh4 = VSRS_S8_S32_mv_w_srs $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl4 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl4, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $wl4, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wh6 = VMOV_mv_w $wl0 + $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + $wh2 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl10 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl3, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl9 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wl7 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wl4 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wh1 = VMOV_mv_w $wl0 + $cm0 = VMUL_vmac_cm_core_dense $x1, $x10, $r0 + $wl10, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl1, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $r3 = ADD_add_r_ri $r3, -4, implicit-def $srcarry + $wh3 = VMOV_mv_w $wl0 + $wh7 = VMOV_mv_w $wl0 + $wh2 = VMOV_mv_w $wl0 + $wh10 = VMOV_mv_w $wl0 + JNZ $r3, %bb.1 + DelayedSchedBarrier + bb.2: + liveins: $r1, $r2 + successors: %bb.3 + $r2 
= OR $r2, $r1 + bb.3: + liveins: $r2 + RET implicit $lr + DelayedSchedBarrier implicit $r2 +...
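
For context, the sketch below is a hypothetical, self-contained C++ illustration (simplified stand-ins for MachineInstr/MCInstrDesc and a made-up opcode name, not the actual LLVM API) of the bookkeeping this patch relies on: the slot/opcode selection for a multi-slot pseudo is recorded in a map during iterative scheduling, cloned prologue/epilogue copies inherit that selection, and the pseudos are only rewritten to their final opcode once the block schedule is committed.

// Hypothetical sketch, not the LLVM classes: mirrors the flow of
// AIEAlternateDescriptors in this patch with simplified types.
#include <cstdio>
#include <map>
#include <optional>
#include <string>

struct InstrDesc { std::string Name; }; // stand-in for MCInstrDesc
struct Instr { std::string Opcode; };   // stand-in for MachineInstr

class AlternateDescriptors {
  std::map<Instr *, const InstrDesc *> Map;

public:
  // During scheduling: remember which concrete slot/opcode was picked for a
  // multi-slot pseudo instruction, without rewriting the instruction yet.
  void select(Instr *MI, const InstrDesc *Desc) { Map[MI] = Desc; }

  std::optional<const InstrDesc *> selected(Instr *MI) const {
    auto It = Map.find(MI);
    if (It == Map.end())
      return std::nullopt;
    return It->second;
  }

  // Like the PipelineExtractor change: a cloned prologue/epilogue copy must
  // inherit the selection made for the in-loop instruction.
  void propagateToClone(Instr *Orig, Instr *Clone) {
    if (auto Desc = selected(Orig))
      select(Clone, *Desc);
  }

  // Like materializeMultiOpcodeInstrs() at commitBlockSchedule(): only now do
  // the pseudos take their final opcode, after iterative scheduling is done.
  void materializeAll() {
    for (auto &[MI, Desc] : Map)
      MI->Opcode = Desc->Name;
    Map.clear();
  }
};

int main() {
  InstrDesc SlotVariant{"VMOV_mv_w_slot_s"}; // hypothetical concrete opcode
  Instr Pseudo{"VMOV_mv_w"}, Clone = Pseudo;

  AlternateDescriptors AltDescs;
  AltDescs.select(&Pseudo, &SlotVariant);     // picked while scheduling the loop
  AltDescs.propagateToClone(&Pseudo, &Clone); // copied into prologue/epilogue
  AltDescs.materializeAll();                  // committed with the block

  std::printf("%s %s\n", Pseudo.Opcode.c_str(), Clone.Opcode.c_str());
}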