From 904d379ab5b5ce8b9cfea6a8d1520b38960cf8d3 Mon Sep 17 00:00:00 2001
From: Krishnam Tibrewala
Date: Thu, 3 Oct 2024 05:23:47 -0700
Subject: [PATCH] [AIEX] Re-assign multi-slot instructions during iterative scheduling

Keep the alternate descriptors selected for multi-slot (multi-opcode)
instructions alive across scheduling iterations instead of materializing
them when leaving each region. Hazard detection, scoreboard construction,
the pipeline extractor and the post-pipeliner now consult the selected
alternate descriptor, and the descriptors are only materialized once the
block schedule is committed. The behaviour is guarded by the new
-aie-reassign-multislot-instr option (default: true).
---
 llvm/lib/Target/AIE/AIEAlternateDescriptors.h |   8 +
 llvm/lib/Target/AIE/AIEHazardRecognizer.cpp   |   2 +-
 .../Target/AIE/AIEInterBlockScheduling.cpp    |  29 +-
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp   |  41 ++-
 llvm/lib/Target/AIE/AIEPostPipeliner.cpp      |   9 +-
 .../schedule/loopaware/loop-multiSlot.mir     | 301 ++++++++++++++++++
 6 files changed, 353 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir

diff --git a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
index 01673a348efd..457eb947c8dd 100644
--- a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
+++ b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
@@ -31,6 +31,10 @@ class AIEAlternateDescriptors {
   AIEAlternateDescriptors() = default;
   ~AIEAlternateDescriptors() = default;
 
+  const MIAltDescsMap &getAlternateDescriptors() const {
+    return AlternateDescs;
+  }
+
   // Construct an alternate descriptor with the given alternate descriptors.
   AIEAlternateDescriptors(const MIAltDescsMap &AltDescs)
       : AlternateDescs(AltDescs) {}
@@ -43,6 +47,10 @@ class AIEAlternateDescriptors {
     AlternateDescs[MI] = &TII->get(AltInstOpcode);
   }
 
+  void setAlternateDescriptor(MachineInstr *MI, const MCInstrDesc *AltDesc) {
+    AlternateDescs[MI] = AltDesc;
+  }
+
   // Return the alternate descriptor for the given multi-opcode instruction.
   std::optional<const MCInstrDesc *>
   getSelectedDescriptor(MachineInstr *MI) const {
diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
index 5eb8dfa3a943..fdb0b039bd41 100644
--- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
+++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
@@ -447,7 +447,7 @@ ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType(
 bool AIEHazardRecognizer::checkConflict(
     const ResourceScoreboard<FuncUnitWrapper> &Scoreboard, MachineInstr &MI,
     int DeltaCycles) const {
-  const MCInstrDesc &Desc = MI.getDesc();
+  const MCInstrDesc &Desc = *SelectedAltDescs.getDesc(&MI);
   const unsigned SchedClass =
       TII->getSchedClass(Desc, MI.operands(), MI.getMF()->getRegInfo());
   const MemoryBankBits MemoryBanks = getMemoryBanks(&MI);
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index be4fd26b295e..6392dcf34eaf 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -70,8 +70,9 @@ void emitBundlesTopDown(const std::vector<MachineBundle> &Bundles,
   // then this will not cause conflicts.
   for (int I = TotalBundles - AmountToEmit; I < TotalBundles; I++) {
     for (MachineInstr *MI : Bundles[I].getInstrs())
-      HR->emitInScoreboard(Scoreboard, MI->getDesc(), HR->getMemoryBanks(MI),
-                           MI->operands(), MI->getMF()->getRegInfo(), 0);
+      HR->emitInScoreboard(Scoreboard, *HR->getSelectedAltDescs().getDesc(MI),
+                           HR->getMemoryBanks(MI), MI->operands(),
+                           MI->getMF()->getRegInfo(), 0);
     Scoreboard.advance();
   }
 }
@@ -100,8 +101,9 @@ createBottomUpScoreboard(ArrayRef<MachineBundle> Bundles,
       Bundles.begin(), Bundles.begin() + std::min(NumBundles, RequiredCycles));
   for (const MachineBundle &B : reverse(MinBundles)) {
     for (MachineInstr *MI : B.getInstrs())
-      HR.emitInScoreboard(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
-                          MI->operands(), MI->getMF()->getRegInfo(), 0);
+      HR.emitInScoreboard(Scoreboard, *HR.getSelectedAltDescs().getDesc(MI),
+                          HR.getMemoryBanks(MI), MI->operands(),
+                          MI->getMF()->getRegInfo(), 0);
     Scoreboard.recede();
   }
   return Scoreboard;
 }
@@ -124,9 +126,9 @@ checkResourceConflicts(const ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
     for (MachineInstr *MI : B.getInstrs()) {
       if (BottomUpCycle >= HR.getConflictHorizon())
         break;
-      if (HR.getHazardType(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
-                           MI->operands(), MI->getMF()->getRegInfo(),
-                           -BottomUpCycle))
+      if (HR.getHazardType(Scoreboard, *HR.getSelectedAltDescs().getDesc(MI),
+                           HR.getMemoryBanks(MI), MI->operands(),
+                           MI->getMF()->getRegInfo(), -BottomUpCycle))
         return MI;
     }
     ++BottomUpCycle;
   }
@@ -233,6 +235,7 @@ namespace {
 /// into the appropriate blockstate region.
 /// TimedRegion is built one bundle at the time
 class PipelineExtractor : public PipelineScheduleVisitor {
+  InterBlockScheduling &InterBlock;
   BlockState &Loop;
   BlockState *Prologue = nullptr;
   BlockState *Epilogue = nullptr;
@@ -263,14 +266,22 @@ class PipelineExtractor : public PipelineScheduleVisitor {
     // Prologue and epilogue obtain copies.
     MachineInstr *ToBeEmitted =
         InLoop ? MI : Loop.TheBlock->getParent()->CloneMachineInstr(MI);
-    CurrentBundle.add(ToBeEmitted);
+    if (auto AltDesc =
+            InterBlock.getSelectedAltDescs().getSelectedDescriptor(MI);
+        AltDesc.has_value())
+      InterBlock.getSelectedAltDescs().setAlternateDescriptor(ToBeEmitted,
+                                                              AltDesc.value());
+
+    CurrentBundle.add(ToBeEmitted,
+                      InterBlock.getSelectedAltDescs().getOpcode(MI));
   }
   void endBundle() override { TimedRegion.emplace_back(CurrentBundle); }
 
 public:
   PipelineExtractor(InterBlockScheduling &InterBlock, BlockState &BS,
                     const AIEBaseInstrInfo &TII)
-      : Loop(BS), CurrentBundle(TII.getFormatInterface()) {
+      : InterBlock(InterBlock), Loop(BS),
+        CurrentBundle(TII.getFormatInterface()) {
     MachineBasicBlock *LoopBlock = Loop.TheBlock;
     for (auto *P : LoopBlock->predecessors()) {
       if (P == LoopBlock) {
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index cda5c78c5c98..98172a66c7a6 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -83,6 +83,10 @@ static cl::opt<bool>
     InterBlockAlignment("aie-interblock-alignment", cl::init(true),
                         cl::desc("Allow for alignment of successor blocks"));
 
+static cl::opt<bool> ReAssignMultiSlotInstr(
+    "aie-reassign-multislot-instr", cl::init(true),
+    cl::desc("Re-assign multi-slot instructions during iterative scheduling"));
+
 namespace {
 // A sentinel value to represent an unknown SUnit.
 const constexpr unsigned UnknownSUNum = ~0;
@@ -531,7 +535,19 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
   IsBottomRegion = true;
 }
 
+void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() {
+  for (auto [MI, Desc] :
+       InterBlock.getSelectedAltDescs().getAlternateDescriptors())
+    MI->setDesc(*Desc);
+
+  InterBlock.getSelectedAltDescs().clear();
+}
+
 void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) {
+
+  if (ReAssignMultiSlotInstr)
+    materializeMultiOpcodeInstrs();
+
   auto &BS = InterBlock.getBlockState(BB);
 
   // Safety margin, swp epilogue
@@ -594,8 +610,6 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   if (BS.FixPoint.Stage != SchedulingStage::Scheduling) {
     return;
   }
-  materializeMultiOpcodeInstrs();
-  InterBlock.getSelectedAltDescs().clear();
   if (IsBottomRegion) {
     // This is the earliest point where we can destroy the recorded
     // schedule in iterative scheduling. enterMBB and enterRegion are too early,
@@ -611,6 +625,8 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   assert(BS.getCurrentRegion().Bundles.empty());
   BS.addBundles(TopBundles);
   BS.addBundles(BotBundles);
+  if (!ReAssignMultiSlotInstr)
+    materializeMultiOpcodeInstrs();
   RegionBegin = nullptr;
   RegionEnd = nullptr;
   IsBottomRegion = false;
@@ -618,27 +634,6 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   DEBUG_BLOCKS(dbgs() << " << leaveRegion\n");
 }
 
-void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() {
-  const TargetInstrInfo *TII = getTII(CurMBB);
-  const AIEHazardRecognizer &TopHazardRec = *getAIEHazardRecognizer(Top);
-  const AIEHazardRecognizer &BotHazardRec = *getAIEHazardRecognizer(Bot);
-
-  auto MaterializePseudo = [&TII](MachineInstr &MI,
-                                  const AIEHazardRecognizer &HazardRec) {
-    // Materialize instructions with multiple opcode options
-    if (std::optional<unsigned> AltOpcode =
-            HazardRec.getSelectedAltDescs().getSelectedOpcode(&MI)) {
-      MI.setDesc(TII->get(*AltOpcode));
-    }
-  };
-
-  assert(DAG->top() == DAG->bottom());
-  for (MachineInstr &MI : make_range(DAG->begin(), DAG->top()))
-    MaterializePseudo(MI, TopHazardRec);
-  for (MachineInstr &MI : make_range(DAG->bottom(), DAG->end()))
-    MaterializePseudo(MI, BotHazardRec);
-}
-
 bool AIEPostRASchedStrategy::checkInterZoneConflicts(
     const std::vector<MachineBundle> &BotBundles) const {
   const AIEHazardRecognizer *TopHazardRec = getAIEHazardRecognizer(Top);
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
index 950ea4ccfbad..36a741f949f4 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
@@ -266,11 +266,12 @@ bool PostPipeliner::scheduleFirstIteration() {
       return false;
     }
     const int LocalCycle = Actual % II;
+    const MCInstrDesc &Desc = *HR.getSelectedAltDescs().getDesc(MI);
     const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI);
     LLVM_DEBUG(dbgs() << " Emit in " << -Depth + LocalCycle << "\n");
     int Cycle = -Depth + LocalCycle;
     LLVM_DEBUG(dbgs() << " Emit in " << Cycle << "\n");
-    HR.emitInScoreboard(Scoreboard, MI->getDesc(), MemoryBanks, MI->operands(),
+    HR.emitInScoreboard(Scoreboard, Desc, MemoryBanks, MI->operands(),
                         MI->getMF()->getRegInfo(), Cycle);
 
     scheduleNode(SU, Actual);
@@ -317,12 +318,12 @@ bool PostPipeliner::scheduleOtherIterations() {
         LLVM_DEBUG(dbgs() << " Resource conflict\n");
         return false;
       }
+      const MCInstrDesc &Desc = *HR.getSelectedAltDescs().getDesc(MI);
       const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI);
       const int LocalCycle = (Insert - CurrentCycle) % II;
LLVM_DEBUG(dbgs() << " Emit in " << -Depth + LocalCycle << "\n"); - HR.emitInScoreboard(Scoreboard, MI->getDesc(), MemoryBanks, - MI->operands(), MI->getMF()->getRegInfo(), - -Depth + LocalCycle); + HR.emitInScoreboard(Scoreboard, Desc, MemoryBanks, MI->operands(), + MI->getMF()->getRegInfo(), -Depth + LocalCycle); scheduleNode(SU, Insert); } } diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir new file mode 100644 index 000000000000..d774ea6327a6 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir @@ -0,0 +1,301 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 --run-pass=postmisched --aie-reassign-multislot-instr=true %s -o - | FileCheck %s --check-prefix=ON +# RUN: llc --mtriple=aie2 --run-pass=postmisched --aie-reassign-multislot-instr=false %s -o - | FileCheck %s --check-prefix=OFF + +--- +name: multislot_across_loop +alignment: 16 +tracksRegLiveness: true +body: | + ; ON-LABEL: name: multislot_across_loop + ; ON: bb.0: + ; ON-NEXT: successors: %bb.1(0x80000000) + ; ON-NEXT: liveins: $p0, $r0, $r1, $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.1: + ; ON-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; ON-NEXT: liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + ; ON-NEXT: {{ $}} + ; ON-NEXT: $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 { + ; ON-NEXT: $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + ; ON-NEXT: } + ; ON-NEXT: $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0 + ; ON-NEXT: BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 { + ; ON-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0 + ; ON-NEXT: } + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 { + ; ON-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs 
killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0 + ; ON-NEXT: } + ; ON-NEXT: $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 { + ; ON-NEXT: $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0 + ; ON-NEXT: } + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; ON-NEXT: $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: } + ; ON-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 { + ; ON-NEXT: $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; ON-NEXT: $wh6 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; ON-NEXT: $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $r3, implicit-def $srcarry, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit killed $r3, implicit $x6, implicit $x2, implicit $r0 { + ; ON-NEXT: $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry + ; ON-NEXT: $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit killed $wh2, implicit $p2, implicit $r3 { + ; ON-NEXT: 
VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: JNZ $r3, %bb.1 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $p2, implicit-def $wh1, implicit killed $wl2, implicit killed $p2, implicit $wl0 { + ; ON-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; ON-NEXT: $wh1 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 { + ; ON-NEXT: $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wh7 = VMOV_mv_w $wl0 + ; ON-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl4, implicit-def $wh3, implicit $p1, implicit $wl0 { + ; ON-NEXT: $wl4 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wh3 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl2, implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $wl0, implicit $x1, implicit killed $x10, implicit $r0 { + ; ON-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wh2 = VMOV_mv_w $wl0 + ; ON-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x1, killed $x10, $r0 + ; ON-NEXT: } + ; ON-NEXT: BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p0, implicit $m0, implicit killed $p1, implicit $m1, implicit $wl0 { + ; ON-NEXT: $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; ON-NEXT: $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; ON-NEXT: $wh10 = VMOV_mv_w $wl0 + ; ON-NEXT: } + ; ON-NEXT: DelayedSchedBarrier + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.2: + ; ON-NEXT: successors: %bb.3(0x80000000) + ; ON-NEXT: liveins: $r1, $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: $r2 = OR killed $r2, killed $r1 + ; ON-NEXT: {{ $}} + ; ON-NEXT: bb.3: + ; ON-NEXT: liveins: $r2 + ; ON-NEXT: {{ $}} + ; ON-NEXT: RET implicit $lr + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: DelayedSchedBarrier implicit killed $r2 + ; + ; OFF-LABEL: name: multislot_across_loop + ; OFF: bb.0: + ; OFF-NEXT: successors: %bb.1(0x80000000) + ; OFF-NEXT: liveins: $p0, $r0, $r1, $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.1: + ; OFF-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; OFF-NEXT: liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def 
$amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 { + ; OFF-NEXT: $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0 + ; OFF-NEXT: BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 { + ; OFF-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 { + ; OFF-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 { + ; OFF-NEXT: $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh2, implicit-def $srsrs_of, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd { + ; OFF-NEXT: $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, 
implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 { + ; OFF-NEXT: $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; OFF-NEXT: $wh6 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 { + ; OFF-NEXT: $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit $x6, implicit $x2, implicit $r0 { + ; OFF-NEXT: $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $r3, implicit-def $srcarry, implicit killed $wh2, implicit $p2, implicit killed $r3 { + ; OFF-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $p2, implicit killed $wl2, implicit killed $p2, implicit $r3 { + ; OFF-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7) + ; OFF-NEXT: JNZ $r3, %bb.1 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wh1, implicit killed $p1, implicit $m1, implicit $wl0 { + ; OFF-NEXT: $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wh1 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl4, implicit-def $wl6, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit $p1, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 { + ; OFF-NEXT: $wl4 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh7 = VMOV_mv_w $wl0 + ; OFF-NEXT: $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: $wh3 = VMOV_mv_w $wl0 + ; OFF-NEXT: BUNDLE implicit-def $wl2, implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $wl0, implicit $x1, implicit killed $x10, implicit $r0 { + ; OFF-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wh2 = VMOV_mv_w $wl0 + ; OFF-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x1, killed $x10, $r0 + ; OFF-NEXT: } + ; OFF-NEXT: BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wl1, implicit-def $p1, implicit-def $wh10, 
implicit killed $p0, implicit $m0, implicit killed $p1, implicit $m1, implicit $wl0 { + ; OFF-NEXT: $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5) + ; OFF-NEXT: $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6) + ; OFF-NEXT: $wh10 = VMOV_mv_w $wl0 + ; OFF-NEXT: } + ; OFF-NEXT: DelayedSchedBarrier + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.2: + ; OFF-NEXT: successors: %bb.3(0x80000000) + ; OFF-NEXT: liveins: $r1, $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: $r2 = OR killed $r2, killed $r1 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: bb.3: + ; OFF-NEXT: liveins: $r2 + ; OFF-NEXT: {{ $}} + ; OFF-NEXT: RET implicit $lr + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: DelayedSchedBarrier implicit killed $r2 + bb.0: + liveins: $p0, $r0, $r1, $r2 + successors: %bb.1 + bb.1: + successors: %bb.1, %bb.2 + liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0 + $cm3 = VMUL_vmac_cm_core_dense $x7, $x9, $r0 + $wh3 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl3 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $cm1 = VMUL_vmac_cm_core_dense $x6, $x8, $r0 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl3, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $cm2 = VMUL_vmac_cm_core_dense $x2, $x4, $r0 + $wh2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl2 = VSRS_S8_S32_mv_w_srs $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $cm0 = VMUL_vmac_cm_core_dense $x10, $x1, $r0 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $wl2 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wh4 = VSRS_S8_S32_mv_w_srs $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl4 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl4, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $wl4, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wh6 = VMOV_mv_w $wl0 + $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0 + $wh2 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl10 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl3, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl9 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wl7 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x 
s32>), align 32, addrspace 5) + VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7) + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7) + $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5) + $wl4 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6) + $wh1 = VMOV_mv_w $wl0 + $cm0 = VMUL_vmac_cm_core_dense $x1, $x10, $r0 + $wl10, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5) + $wl1, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6) + $r3 = ADD_add_r_ri $r3, -4, implicit-def $srcarry + $wh3 = VMOV_mv_w $wl0 + $wh7 = VMOV_mv_w $wl0 + $wh2 = VMOV_mv_w $wl0 + $wh10 = VMOV_mv_w $wl0 + JNZ $r3, %bb.1 + DelayedSchedBarrier + bb.2: + liveins: $r1, $r2 + successors: %bb.3 + $r2 = OR $r2, $r1 + bb.3: + liveins: $r2 + RET implicit $lr + DelayedSchedBarrier implicit $r2 +...