From 8193c578c46d457dc0b4bb632bf32635dc95dad4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 11:57:47 +0100 Subject: [PATCH 1/4] [AIE2] NFC: Add baseline test for complex loop-aware sched convergence We want to increase the safety margin for one instruction at a time here, instead of doing it for all instructions at once. --- .../aie2/schedule/loopaware/Add2D-like.mir | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir new file mode 100644 index 000000000000..74765446d4f5 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir @@ -0,0 +1,109 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 --run-pass=postmisched \ +# RUN: %s -o - | FileCheck %s + +# A simplified example of a SW-pipelined 2xVLD.UPS -> VADD -> VST.SRS loop. +# We want to make sure the VST.SRS ends up in the last cycle of the loop, +# and the VLD.UPS in the first cycles. +# This means that in the fixpoint loop for loop-aware-scheduling, one needs to +# increase the safety margin for one instruction at a time: The VLDs need to be +# pushed up, not the VST. +# FIXME: Actually do this. 
+--- +name: add2d +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: add2d + ; CHECK: bb.0 (align 16): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4, $r5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $dc0 = MOVA_lda_cg 0 + ; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $dc4, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit $dc0 { + ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: $dc4 = MOV_mv_scl $dc0 + ; CHECK-NEXT: } + ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 killed $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $s1 = MOV_mv_scl killed $r2 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def dead $srcarry, implicit-def $s0, implicit killed $r1, implicit killed $r4 { + ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + ; CHECK-NEXT: $s0 = MOV_mv_scl killed $r4 + ; CHECK-NEXT: } + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (align 16): + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $cm8 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 + ; 
CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit killed $r1 { + ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + ; CHECK-NEXT: } + ; CHECK-NEXT: JNZ renamable $r1, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $cm0, $cm4, $p3, $r0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, killed renamable $r0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm0, killed renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0 (align 16): + successors: %bb.1 + 
liveins: $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4, $r5 + + renamable $dc0 = MOV_PD_imm10_pseudo 0 + $s1 = MOV_mv_scl killed $r2 + $s0 = MOV_mv_scl killed $r4 + renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + $dc4 = MOV_mv_scl $dc0 + $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + + bb.1 (align 16): + successors: %bb.1, %bb.2 + liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 + + renamable $cm8 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 + renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + JNZ renamable $r1, %bb.1 + DelayedSchedBarrier + + bb.2: + liveins: $cm0, $cm4, $p3, $r0, $s0 + + renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 + renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm0, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, 
implicit $crsrssign + RET implicit $lr + DelayedSchedBarrier +... From 8dc4a1bbecd1c901180f16b31770d0dbc8cb8253 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 12:27:46 +0100 Subject: [PATCH 2/4] [AIEX] Loop-aware sched: Increase latency margin per instruction --- .../Target/AIE/AIEInterBlockScheduling.cpp | 45 +++++++++++----- llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 8 ++- llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp | 2 +- .../AIE/aie2/end-to-end/Conv2D-red-swp.ll | 52 +++++++++---------- .../aie2/schedule/loopaware/Add2D-like.mir | 16 +++--- .../aie2/schedule/loopaware/short-hwloop.mir | 7 ++- 6 files changed, 76 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index e163e6f8ab29..b19a7545752c 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -37,6 +37,10 @@ static cl::opt LoopEpilogueAnalysis( "aie-loop-epilogue-analysis", cl::init(true), cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling")); +static cl::opt MaxExpensiveIterations( + "aie-loop-aware-expensive-iterations", cl::init(25), + cl::desc("[AIE] Max iterations raising latency margins per instruction")); + namespace llvm::AIE { void dumpInterBlock(const InterBlockEdges &Edges) { @@ -166,10 +170,13 @@ bool InterBlockScheduling::leaveBlock() { if (BS.Kind == BlockType::Loop && !updateFixPoint(BS)) { BS.FixPoint.NumIters++; // Iterate on CurrentBlock + // We will first try to increase the latency margin for one instruction at + // a time, before increasing that margin for all instructions at once. // If we are very unlucky, we may step both the latency margin and // the resource margin to the max. Any more indicates failure to converge, // and we abort to prevent an infinite loop. 
- if (BS.FixPoint.NumIters > 2 * HR->getConflictHorizon()) { + if (BS.FixPoint.NumIters > + MaxExpensiveIterations + 2 * HR->getConflictHorizon()) { report_fatal_error("Inter-block scheduling did not converge."); } return false; @@ -219,7 +226,7 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const { return true; } -bool InterBlockScheduling::latencyConverged(BlockState &BS) const { +MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const { const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget(); auto *TII = static_cast(SubTarget.getInstrInfo()); auto *ItinData = SubTarget.getInstrItineraryData(); @@ -283,7 +290,7 @@ bool InterBlockScheduling::latencyConverged(BlockState &BS) const { << " not met (" << Distance << ")\n"); DEBUG_LOOPAWARE(dbgs() << " " << Succ->NodeNum << ": " << *Succ->getInstr()); - return false; + return Pred->getInstr(); } } } @@ -296,7 +303,7 @@ bool InterBlockScheduling::latencyConverged(BlockState &BS) const { // upperbound of the latency safety margin that should be provided by // the epilogue BS.FixPoint.MaxLatencyExtent = MaxExtent; - return true; + return nullptr; } bool InterBlockScheduling::updateFixPoint(BlockState &BS) { @@ -316,11 +323,20 @@ bool InterBlockScheduling::updateFixPoint(BlockState &BS) { // Iterate on CurMBB return false; } - if (!latencyConverged(BS)) { - BS.FixPoint.LatencyMargin++; + + if (MachineInstr *MINeedsHigherCap = latencyConverged(BS)) { + auto Res = BS.FixPoint.PerMILatencyMargin.try_emplace(MINeedsHigherCap, 0); + // Increase the latency margin per instruction, unless we already iterated + // more than MaxExpensiveIterations without converging. 
+ if (BS.FixPoint.NumIters <= MaxExpensiveIterations) { + ++Res.first->second; + } else { + BS.FixPoint.LatencyMargin++; + } DEBUG_LOOPAWARE(dbgs() << " not converged: latency RM=" - << BS.FixPoint.ResourceMargin << " LM=>" - << BS.FixPoint.LatencyMargin << "\n"); + << BS.FixPoint.ResourceMargin + << " LM=" << BS.FixPoint.LatencyMargin + << " MIM=" << Res.first->second << "\n"); // Iterate on CurMBB return false; } @@ -341,13 +357,18 @@ bool InterBlockScheduling::successorsAreScheduled( }); } -std::optional -InterBlockScheduling::getLatencyCap(MachineBasicBlock *BB) const { - auto &BS = getBlockState(BB); +std::optional InterBlockScheduling::getLatencyCap(MachineInstr &MI) const { + auto &BS = getBlockState(MI.getParent()); if (BS.Kind != BlockType::Loop) { return {}; } - return BS.FixPoint.LatencyMargin; + if (BS.FixPoint.LatencyMargin) + return BS.FixPoint.LatencyMargin; + if (const auto *It = BS.FixPoint.PerMILatencyMargin.find(&MI); + It != BS.FixPoint.PerMILatencyMargin.end()) { + return It->second; + } + return 0; } std::optional diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index 71e2a10bbfdb..8fb452a656b3 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -110,6 +110,7 @@ class FixedpointState { public: bool IsScheduled = false; int LatencyMargin = 0; + SmallMapVector PerMILatencyMargin; int ResourceMargin = 0; // Results from the convergence test int MaxLatencyExtent = 0; @@ -245,7 +246,10 @@ class InterBlockScheduling { /// The two components of the convergence test bool resourcesConverged(BlockState &BS) const; - bool latencyConverged(BlockState &BS) const; + + /// Return one instruction that needs a higher latency cap, or nullptr if all + /// latencies converged. 
+ MachineInstr *latencyConverged(BlockState &BS) const; /// After finding the loops, determine the epilogue blocks void markEpilogueBlocks(); @@ -301,7 +305,7 @@ class InterBlockScheduling { /// Return the maximum interblock latency we need to account for /// for the given successor. This represents the latency margin we assume for /// an unscheduled successor. - std::optional getLatencyCap(MachineBasicBlock *BB) const; + std::optional getLatencyCap(MachineInstr &MI) const; /// Return the maximum number of cycles to block for the given successor. /// This represents the resource usage we assume for an unscheduled successor. diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp index 6d4e0dd611d2..433c8a4780fa 100644 --- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp +++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp @@ -173,7 +173,7 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) { // scheduling a loop. const AIE::InterBlockScheduling &IB = Scheduler->getInterBlock(); if (!InterBlock) { - if (auto Cap = IB.getLatencyCap(CurBB)) { + if (auto Cap = IB.getLatencyCap(MI)) { LLVM_DEBUG(dbgs() << "Capped at " << *Cap << "\n"); Latency = std::min(Latency, *Cap); } diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 7b514c67d750..31f18e67cb2d 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -300,7 +300,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: .LBB0_1: // %outer.loop.header ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 -; DCL-NEXT: vlda wl6, [p1], #32; nopb ; nopx +; DCL-NEXT: vlda wl6, [p1], #32; nopxm ; DCL-NEXT: vlda wl5, [p0], m6; mov r0, p0 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; DCL-NEXT: vlda wh6, [p1], #32 @@ -310,7 +310,6 @@ define dso_local 
void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda wl7, [p0], m6 ; DCL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; DCL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; DCL-NEXT: vlda wh8, [p1], #32 ; DCL-NEXT: vlda.3d wh7, [p0], d0 ; DCL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 @@ -323,16 +322,16 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 ; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] -; DCL-NEXT: vlda wl10, [p1], #32 +; DCL-NEXT: vlda wh8, [p1], #32 ; DCL-NEXT: vlda wl3, [p0], m6; mov r1, p0 ; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9 ; DCL-NEXT: vlda wh3, [p0], m6; add r0, r0, #33 ; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0 ; DCL-NEXT: vlda.3d wh5, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 -; DCL-NEXT: vlda wh10, [p1], #32; add r0, r10, #33; mov r10, p0 -; DCL-NEXT: vlda wl1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 -; DCL-NEXT: vlda wh1, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 -; DCL-NEXT: and r10, r10, r9 +; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0 +; DCL-NEXT: vlda wh10, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 +; DCL-NEXT: vlda wl1, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 +; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9 ; DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_2: // %inner.loop ; DCL-NEXT: // Parent Loop BB0_1 Depth=1 @@ -340,14 +339,13 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 ; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 ; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x2, x2, s1, x5, r0 -; DCL-NEXT: vlda wh3, [p0], m6; vshuffle x11, x9, 
x0, r8 -; DCL-NEXT: vlda wl5, [p0], m6; add r1, r1, #-1; vshuffle x7, x4, x2, r2; vmac cm0, cm0, x7, x6, r4 -; DCL-NEXT: vlda wl10, [p1], #32; jnz r1, #.LBB0_2; vmac cm4, cm4, x7, x8, r4 -; DCL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 5 -; DCL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 // Delay Slot 4 -; DCL-NEXT: vlda wl1, [p1], #32; vmov x8, x1; vmac cm3, cm3, x11, x6, r4 // Delay Slot 3 -; DCL-NEXT: vlda wh1, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm7, cm7, x11, x8, r4 // Delay Slot 2 -; DCL-NEXT: and r10, r10, r9 // Delay Slot 1 +; DCL-NEXT: vlda wh3, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 +; DCL-NEXT: vlda wl5, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 +; DCL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 +; DCL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 +; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 +; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 @@ -481,7 +479,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vlda wl6, [p1], #32; nopb ; nopx +; ZOL-NEXT: vlda wl6, [p1], #32; nopb ; nopxm ; ZOL-NEXT: vlda wl3, [p0], m6; mov r0, p0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; ZOL-NEXT: vlda wh6, [p1], #32 @@ -495,7 +493,6 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.3d wh5, 
[p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 -; ZOL-NEXT: vlda wl10, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] @@ -504,32 +501,31 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 ; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; add.nc r1, r5, #-2 ; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; movxm ls, #.LBB0_2 -; ZOL-NEXT: vlda wh10, [p1], #32; movxm le, #.L_LEnd0 +; ZOL-NEXT: vlda wl10, [p1], #32; movxm le, #.L_LEnd0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; mov lc, r1 ; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; nopx ; mov r1, p0; nopv ; ZOL-NEXT: nopb ; vlda wh3, [p0], m6; nops ; and r0, r0, r9; nopm ; nopv ; ZOL-NEXT: nopb ; vlda wl5, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv ; ZOL-NEXT: nopb ; vlda.3d wh5, [p0], d0; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv ; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; nopx ; vshift.align x2, x2, s1, x5, r0; nopv -; ZOL-NEXT: nopb ; vlda wl1, [p1], #32; nops ; and r1, r1, r9; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: nopa ; nopb ; nopx ; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vshuffle x7, x4, x2, r2; nopv +; ZOL-NEXT: nopb ; vlda wl1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv +; ZOL-NEXT: vlda wh1, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 ; ZOL-NEXT: and r1, r1, r9 ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 -; ZOL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; ZOL-NEXT: vlda wl3, [p0], m6; nopx ; vshift.align x4, x4, s1, x3, 
r0; vmac cm5, cm5, x9, x8, r4 +; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 +; ZOL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 ; ZOL-NEXT: vlda wh3, [p0], m6; vshift.align x2, x2, s1, x5, r0 ; ZOL-NEXT: vlda wl5, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vlda wl10, [p1], #32; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 -; ZOL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 +; ZOL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 +; ZOL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 ; ZOL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 -; ZOL-NEXT: vlda wl1, [p1], #32; vmov x8, x1; vmac cm3, cm3, x11, x6, r4 -; ZOL-NEXT: vlda wh1, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm7, cm7, x11, x8, r4 +; ZOL-NEXT: vlda wl1, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: -; ZOL-NEXT: nopb ; nopa ; nops ; and r1, r1, r9; nopm ; nopv +; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; and r1, r1, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir index 74765446d4f5..9211647b6e94 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir @@ -15,7 +15,6 @@ # This means that in the fixpoint loop for loop-aware-scheduling, one needs to # increase the safety margin for one instruction at a time: The VLDs need to be # pushed up, not the VST. -# FIXME: Actually do this. 
--- name: add2d alignment: 16 @@ -48,23 +47,26 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $cm8 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 - ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit killed $r1 { - ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit killed $r1 { + ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry ; CHECK-NEXT: } - ; CHECK-NEXT: JNZ renamable $r1, %bb.1 + ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def 
$bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $cm8, implicit-def $bml8, implicit-def $amll8, implicit-def $amlh8, implicit-def $bmh8, implicit-def $amhl8, implicit-def $amhh8, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit $r1, implicit $cm0, implicit $r0 { + ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: JNZ renamable $r1, %bb.1 + ; CHECK-NEXT: renamable $cm8 = VADD internal renamable $cm4, renamable $cm0, renamable $r0 + ; CHECK-NEXT: } ; CHECK-NEXT: NOP - ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $cm0, $cm4, $p3, $r0, $s0 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP ; CHECK-NEXT: renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, killed renamable $r0 ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir index 6f533ae34bfa..eb61b815734d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir @@ -32,12 
+32,11 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $r0 = ADD_NC_GPR $r1, 1 - ; CHECK-NEXT: BUNDLE implicit-def $p0, implicit-def $r1, implicit $r0, implicit killed $p0, implicit killed $r1 { - ; CHECK-NEXT: $p0 = ST_dms_sts_pstm_nrm_imm $r0, killed $p0, 4 + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $r0, implicit killed $r1 { ; CHECK-NEXT: $r1 = MUL_mul_r_rr killed $r1, $r1 + ; CHECK-NEXT: $r0 = ADD_NC_GPR internal $r1, 1 ; CHECK-NEXT: } - ; CHECK-NEXT: NOP + ; CHECK-NEXT: $p0 = ST_dms_sts_pstm_nrm_imm $r0, killed $p0, 4 ; CHECK-NEXT: PseudoLoopEnd %bb.2, %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: From 73f1cd4643c6e3445f91486d0151d4f5e077d8db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 16:00:18 +0100 Subject: [PATCH 3/4] [AIE2] NFC: Add baseline test with critical CM reg pressure In a follow-up commit, the premisched will re-order the instructions to reduce the pressure and avoid spills during RA. --- .../AIE/aie2/schedule/pre_ra/add2d_inner.mir | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir new file mode 100644 index 000000000000..ddf2c75b1e5e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir @@ -0,0 +1,125 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -march=aie2 -run-pass=machine-scheduler %s -o - | FileCheck %s + + +# This represents the innermost loop of Add2D after SW pipelining. +# We should see most of the VLDA.UPS instructions move down in the loop +# BB to reduce the reg pressure and avoid spills. They can later be moved back +# up by the post-RA scheduler. This should also make the 4 acc1024 COPY +# instructions coalesce-able. +--- +name: add2d_innermost +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: add2d_innermost + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $m0, $cm0, $cm1, $s0, $d1, $x0, $r0, $d0_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:em = COPY $m0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:eds = COPY $d0_3d + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:acc1024 = COPY [[COPY4]] + ; CHECK-NEXT: [[VADD:%[0-9]+]]:acc1024 = VADD [[COPY5]], [[COPY17]], [[COPY14]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc1024 = COPY [[COPY3]] + ; CHECK-NEXT: 
[[VADD1:%[0-9]+]]:acc1024 = VADD [[COPY6]], [[COPY18]], [[COPY14]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc1024 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[VADD2:%[0-9]+]]:acc1024 = VADD [[COPY7]], [[COPY19]], [[COPY14]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc1024 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[VADD3:%[0-9]+]]:acc1024 = VADD [[COPY8]], [[COPY20]], [[COPY14]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, 
implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:er = ADD_add_r_ri [[COPY13]], -4, implicit-def dead $srcarry + ; CHECK-NEXT: PseudoJNZ [[COPY13]], %bb.1 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + + bb.0.entry: + liveins: $p0, $m0, $cm0, $cm1, $s0, $d1, $x0, $r0, $d0_3d + + %367:acc1024 = COPY $cm0 + %365:acc1024 = COPY $cm0 + %363:acc1024 = COPY $cm0 + %361:acc1024 = COPY $cm0 + %362:acc1024 = 
COPY $cm0 + %364:acc1024 = COPY $cm0 + %366:acc1024 = COPY $cm0 + %368:acc1024 = COPY $cm0 + %248:mss = COPY $s0 + %245:mss = COPY $s0 + %355:ep_as_32bit = COPY $p0 + %358:ep_as_32bit = COPY $p0 + %359:ep_as_32bit = COPY $p0 + %82:em = COPY $m0 + %272:eds = COPY $d0_3d + %360:er = COPY $r0 + %206:er = COPY $r0 + PseudoJ_jump_imm %bb.1 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %327:acc1024 = COPY %367 + %325:acc1024 = COPY %365 + %323:acc1024 = COPY %363 + %321:acc1024 = COPY %361 + %361:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %360:er = ADD_add_r_ri %360, -4, implicit-def dead $srcarry + %281:acc1024 = VADD %362, %321, %206 + %363:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %284:acc1024 = VADD %364, %323, %206 + %365:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %287:acc1024 = VADD %366, %325, %206 + %367:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %362:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, %272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %295:acc1024 = VADD %368, %327, %206 + %364:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, %272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %366:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, 
%272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %281, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %284, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + %368:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, %272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %287, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %295, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + PseudoJNZ %360, %bb.1 + PseudoJ_jump_imm %bb.2 + + bb.2: + PseudoRET implicit $lr +... 
From 71614b91f1618094cacf70ec03f2458571b6791d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 17:07:42 +0100 Subject: [PATCH 4/4] [AIEX] Premisched: more conservative reg pressure reduction - Reserve a certain number of registers, not regunits - Be extra careful when the region max pressure exceeds limits --- llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 54 +++-- llvm/lib/Target/AIE/AIEMachineScheduler.h | 4 + .../GlobalISel/legalize-dyn-stackalloc.ll | 31 ++- llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll | 31 ++- .../AIE/aie2/end-to-end/Conv2D-red-swp.ll | 212 +++++++++--------- .../CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll | 36 +-- .../test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll | 8 +- .../AIE/aie2/ra/tie-subregs-flow-3d.mir | 60 ++--- .../AIE/aie2/schedule/pre_ra/add2d_inner.mir | 12 +- .../AIE/aie2/schedule/pre_ra/conv2d_inner.mir | 12 +- .../aie2/schedule/pre_ra/reduce_pressure.mir | 94 ++++---- 11 files changed, 293 insertions(+), 261 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 38d4776e2c79..40f95f4ab998 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -40,7 +40,7 @@ static cl::opt cl::desc("Track reg pressure more accurately and " "delay some instructions to avoid spills.")); static cl::opt NumCriticalFreeRegs( - "aie-premisched-near-critical-regs", cl::init(4), + "aie-premisched-near-critical-regs", cl::init(2), cl::desc("Number of free registers below which premisched should actively " "try to reduce the pressure.")); @@ -761,6 +761,33 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } +void AIEPreRASchedStrategy::initialize(ScheduleDAGMI *DAG) { + GenericScheduler::initialize(DAG); + + // Cache the threshold for each pressure set. 
+ const std::vector &RegionMaxPressure = + static_cast(DAG)->getRegPressure().MaxSetPressure; + PSetThresholds.clear(); + for (unsigned PSet = 0, EndPSet = RegionMaxPressure.size(); PSet < EndPSet; + ++PSet) { + unsigned MaxPressure = RegionMaxPressure[PSet]; + unsigned Limit = Context->RegClassInfo->getRegPressureSetLimit(PSet); + + // If the region has a maximum pressure that exceeds the target threshold, + // artificially reduce that threshold to force more conservative scheduling. + if (MaxPressure > Limit) { + unsigned ExtraPressure = MaxPressure - Limit; + if (Limit > ExtraPressure) + Limit -= ExtraPressure; + else + Limit = 0; + LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(PSet) + << " Decreased Threshold to " << Limit << "\n"); + } + PSetThresholds.push_back(Limit); + } +} + void AIEPreRASchedStrategy::enterRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, @@ -874,8 +901,9 @@ bool AIEPreRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone, } unsigned CurrPressure = BotRPT.getRegSetPressureAtPos()[WorstPC.getPSet()]; - if (CurrPressure + WorstPC.getUnitInc() < - TRI->getRegPressureSetLimit(*CurMBB->getParent(), WorstPC.getPSet())) { + if (CurrPressure + WorstPC.getUnitInc() + + (NumCriticalFreeRegs * WorstPC.getUnitInc()) < + PSetThresholds[WorstPC.getPSet()]) { // Worsening pressure, but still within limits, keep node as available return true; } @@ -960,10 +988,11 @@ bool AIEPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, if (!PC.isValid()) return false; unsigned CurrPressure = BotRPT.getRegSetPressureAtPos()[PC.getPSet()]; - unsigned Threshold = - TRI->getRegPressureSetLimit(*CurMBB->getParent(), PC.getPSet()); - return Threshold <= NumCriticalFreeRegs || - CurrPressure >= Threshold - NumCriticalFreeRegs; + unsigned Threshold = PSetThresholds[PC.getPSet()]; + unsigned NumCriticalFreeUnits = + NumCriticalFreeRegs * std::abs(PC.getUnitInc()); + return Threshold <= NumCriticalFreeUnits || + 
CurrPressure >= Threshold - NumCriticalFreeUnits; }; PressureChange TryCandPC = getPressureChange(estimatePressureDiff(*TryCand.SU, BotRPT)); @@ -972,13 +1001,12 @@ bool AIEPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, if ((IsNearCritical(TryCandPC) || IsNearCritical(CandPC)) && tryPressure(TryCandPC, CandPC, TryCand, Cand, RegMax, TRI, DAG->MF)) return TryCand.Reason != NoCand; - } - // Avoid increasing the max pressure of the entire region. - if (DAG->isTrackingPressure() && - tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, - Cand, RegMax, TRI, DAG->MF)) - return TryCand.Reason != NoCand; + // Avoid increasing the max pressure of the entire region. + if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, + TryCand, Cand, RegMax, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + } // Fall through to original instruction order. if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.h b/llvm/lib/Target/AIE/AIEMachineScheduler.h index da184abf12c7..b2f68a07129b 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.h +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.h @@ -151,6 +151,8 @@ class AIEPreRASchedStrategy : public GenericScheduler { public: AIEPreRASchedStrategy(const MachineSchedContext *C) : GenericScheduler(C) {} + void initialize(ScheduleDAGMI *DAG) override; + void enterRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned RegionInstrs); void leaveRegion(const SUnit &ExitSU); @@ -182,6 +184,8 @@ class AIEPreRASchedStrategy : public GenericScheduler { /// pressure-reducing SU to be scheduled first. /// SUDelayerMap[0] = 2 means that SU(0) is waiting on SU(2). 
std::vector SUDelayerMap; + + std::vector PSetThresholds; }; /// An extension to ScheduleDAGMI that provides callbacks on region entry/exit diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll index f67ae5792250..2915bbb173ef 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll @@ -150,34 +150,33 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-NEXT: mov p2, p7 ; CHECK-NEXT: mov p6, p7 ; CHECK-NEXT: paddb [p0], m0 -; CHECK-NEXT: paddb [p2], #-32 +; CHECK-NEXT: paddb [p6], #-32 +; CHECK-NEXT: movxm m0, #-40032 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: paddb [p2], m0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p2, p7 -; CHECK-NEXT: paddb [p2], #-24 +; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: st p0, [p6, #0] +; CHECK-NEXT: mov p0, p7 +; CHECK-NEXT: paddb [p0], #-24 ; CHECK-NEXT: lshl r2, r0, r2 -; CHECK-NEXT: st r0, [p2], #4 +; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: add r2, r2, #31 -; CHECK-NEXT: st r1, [p2, #0] -; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: st r1, [p0, #0] ; CHECK-NEXT: jl #extern_call -; CHECK-NEXT: mov m0, r2 // Delay Slot 5 -; CHECK-NEXT: paddb [p1], m0 // Delay Slot 4 -; CHECK-NEXT: movxm m0, #-40032 // Delay Slot 3 -; CHECK-NEXT: paddb [p6], m0 // Delay Slot 2 +; CHECK-NEXT: mov p0, p1 // Delay Slot 5 +; CHECK-NEXT: and r2, r2, r3 // Delay Slot 4 +; CHECK-NEXT: mov m0, r2 // Delay Slot 3 +; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2 ; CHECK-NEXT: mov sp, p1 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov p0, p6 // Delay Slot 1 -; CHECK-NEXT: nopb ; 
nopa ; nops ; nopx ; mov p0, r16; nopv -; CHECK-NEXT: lda p0, [p0, #0]; nopx +; CHECK-NEXT: mov p0, r16 // Delay Slot 1 +; CHECK-NEXT: lda p0, [p6, #0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll index 09ea5c39ff53..337fecd1e4bd 100644 --- a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll @@ -150,34 +150,33 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-NEXT: mov p2, p7 ; CHECK-NEXT: mov p6, p7 ; CHECK-NEXT: paddb [p0], m0 -; CHECK-NEXT: paddb [p2], #-32 +; CHECK-NEXT: paddb [p6], #-32 +; CHECK-NEXT: movxm m0, #-40032 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: paddb [p2], m0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p2, p7 -; CHECK-NEXT: paddb [p2], #-24 +; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: st p0, [p6, #0] +; CHECK-NEXT: mov p0, p7 +; CHECK-NEXT: paddb [p0], #-24 ; CHECK-NEXT: lshl r2, r0, r2 -; CHECK-NEXT: st r0, [p2], #4 +; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: add r2, r2, #31 -; CHECK-NEXT: st r1, [p2, #0] -; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: st r1, [p0, #0] ; CHECK-NEXT: jl #extern_call -; CHECK-NEXT: mov m0, r2 // Delay Slot 5 -; CHECK-NEXT: paddb [p1], m0 // Delay Slot 4 -; CHECK-NEXT: movxm m0, #-40032 // Delay Slot 3 -; CHECK-NEXT: paddb [p6], m0 // Delay Slot 2 +; CHECK-NEXT: mov p0, p1 // Delay Slot 5 +; CHECK-NEXT: and r2, r2, r3 // Delay Slot 4 +; CHECK-NEXT: mov m0, r2 // Delay Slot 3 +; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2 ; CHECK-NEXT: mov sp, p1 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov p0, p6 // Delay Slot 1 -; CHECK-NEXT: nopb ; nopa ; 
nops ; nopx ; mov p0, r16; nopv -; CHECK-NEXT: lda p0, [p0, #0]; nopx +; CHECK-NEXT: mov p0, r16 // Delay Slot 1 +; CHECK-NEXT: lda p0, [p6, #0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 31f18e67cb2d..fc78d7e13404 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -238,50 +238,50 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: mov p7, sp ; DCL-NEXT: paddb [p7], #-272; st p6, [sp, #-188] // 4-byte Folded Spill ; DCL-NEXT: lda r25, [p7, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-204; mov dc3, dj3 +; DCL-NEXT: paddb [p6], #-292; mov dc3, dj3 ; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-208; mov r28, dj3 +; DCL-NEXT: paddb [p6], #-296; mov r28, dj3 ; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-212; mov dc7, dj3 -; DCL-NEXT: lda dj4, [p6, #0] +; DCL-NEXT: paddb [p6], #-300; mov dc7, dj3 +; DCL-NEXT: lda dn0, [p6, #0] ; DCL-NEXT: mov p6, sp -; DCL-NEXT: paddb [p6], #-216 -; DCL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-220 +; DCL-NEXT: paddb [p6], #-204 +; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-208 +; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-212 +; DCL-NEXT: lda dj4, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-216; mov p7, sp +; DCL-NEXT: lda dn0, [p6, #0]; st m0, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: paddb [p7], #-200; mov p6, sp +; DCL-NEXT: lda m6, [p7, #0]; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; DCL-NEXT: lda dn4, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-228 ; DCL-NEXT: lda r11, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-232; mov p7, sp -; DCL-NEXT: lda dj1, [p6, #0] -; DCL-NEXT: paddb [p7], #-200; mov p6, sp -; DCL-NEXT: lda 
m6, [p7, #0]; paddb [p6], #-236 -; DCL-NEXT: lda r12, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill +; DCL-NEXT: lda dj1, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-236 +; DCL-NEXT: lda r12, [p6, #0] +; DCL-NEXT: mov p6, sp ; DCL-NEXT: paddb [p6], #-240 ; DCL-NEXT: lda dn1, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-244 ; DCL-NEXT: lda dn5, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-292 -; DCL-NEXT: lda m2, [p6, #0] -; DCL-NEXT: mov p6, sp -; DCL-NEXT: paddb [p6], #-296 -; DCL-NEXT: lda dj2, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-300 -; DCL-NEXT: lda dn2, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-248 ; DCL-NEXT: lda r13, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-252; mov p7, sp -; DCL-NEXT: lda dj2, [p6, #0]; st m2, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: lda dj2, [p6, #0] ; DCL-NEXT: mov p6, sp ; DCL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-256 // 4-byte Folded Reload ; DCL-NEXT: lda dj6, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-260 ; DCL-NEXT: lda dn2, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-264; st dj2, [sp, #-88] // 4-byte Folded Spill +; DCL-NEXT: paddb [p6], #-264 ; DCL-NEXT: lda dn6, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-268 ; DCL-NEXT: lda r14, [p6, #0] ; DCL-NEXT: mov p6, sp -; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276; st dn2, [sp, #-92] // 4-byte Folded Reload4-byte Folded Spill +; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276 // 4-byte Folded Reload ; DCL-NEXT: lda dn3, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-280 ; DCL-NEXT: lda r26, [p6, #0]; mov p6, sp @@ -301,10 +301,10 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 ; DCL-NEXT: vlda wl6, [p1], #32; nopxm -; DCL-NEXT: vlda wl5, [p0], m6; mov r0, p0 +; DCL-NEXT: vlda wl3, [p0], m6; mov r0, p0 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; DCL-NEXT: vlda 
wh6, [p1], #32 -; DCL-NEXT: vlda wh5, [p0], m6; mov m5, p4 +; DCL-NEXT: vlda wh3, [p0], m6; mov m5, p4 ; DCL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 ; DCL-NEXT: vlda wl8, [p1], #32 ; DCL-NEXT: vlda wl7, [p0], m6 @@ -323,29 +323,29 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 ; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] ; DCL-NEXT: vlda wh8, [p1], #32 -; DCL-NEXT: vlda wl3, [p0], m6; mov r1, p0 +; DCL-NEXT: vlda wl5, [p0], m6; mov r1, p0 ; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9 -; DCL-NEXT: vlda wh3, [p0], m6; add r0, r0, #33 -; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0 -; DCL-NEXT: vlda.3d wh5, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 -; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0 -; DCL-NEXT: vlda wh10, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 -; DCL-NEXT: vlda wl1, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 -; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9 +; DCL-NEXT: vlda wh5, [p0], m6; add r0, r0, #33 +; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0 +; DCL-NEXT: vlda.3d wh3, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 +; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0 +; DCL-NEXT: vlda wh1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 +; DCL-NEXT: vlda wl10, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 +; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9 ; DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_2: // %inner.loop ; DCL-NEXT: // Parent Loop BB0_1 Depth=1 ; DCL-NEXT: // => This Inner Loop Header: Depth=2 ; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 -; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x2, x2, s1, x5, r0 -; DCL-NEXT: vlda wh3, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, 
x0, r8 -; DCL-NEXT: vlda wl5, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 -; DCL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 -; DCL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 -; DCL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 -; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 -; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 +; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 +; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; DCL-NEXT: vlda wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 +; DCL-NEXT: vlda wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 +; DCL-NEXT: vlda wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 +; DCL-NEXT: vlda wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 +; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 +; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 @@ -359,25 +359,25 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload ; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm6, x0, x8, r4 // 4-byte Folded Reload ; DCL-NEXT: vmac cm4, cm5, x9, x8, r4 -; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x3, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload -; DCL-NEXT: 
vshift.align x2, x2, s1, x5, r0; vmac cm3, cm3, x11, x6, r4 +; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; DCL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 ; DCL-NEXT: vshuffle x6, x4, x2, r2 ; DCL-NEXT: vmac cm6, cm7, x6, x8, r4 -; DCL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x10, r4 +; DCL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x1, r4 ; DCL-NEXT: st dn7, [sp, #-92] // 4-byte Folded Spill -; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x10, r4 +; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; DCL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x10, r4 +; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 -; DCL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] -; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x1, r4 +; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x1, r4 // 4-byte Folded Reload +; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x10, r4 // 4-byte Folded Reload ; DCL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m1, r27 // 4-byte Folded Reload -; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x1, r4 +; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]; mov dj5, r12 -; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r13; vmac cm4, cm6, x5, x1, r4 +; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r13; vmac 
cm4, cm6, x5, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov m3, r14 ; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 ; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] @@ -417,50 +417,50 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: mov p7, sp ; ZOL-NEXT: paddb [p7], #-272; st p6, [sp, #-188] // 4-byte Folded Spill ; ZOL-NEXT: lda r24, [p7, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-204; mov dc3, dj3 +; ZOL-NEXT: paddb [p6], #-292; mov dc3, dj3 ; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-208; mov r27, dj3 +; ZOL-NEXT: paddb [p6], #-296; mov r27, dj3 ; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-212; mov dc7, dj3 -; ZOL-NEXT: lda dj4, [p6, #0] +; ZOL-NEXT: paddb [p6], #-300; mov dc7, dj3 +; ZOL-NEXT: lda dn0, [p6, #0] ; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: paddb [p6], #-216 -; ZOL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-220 +; ZOL-NEXT: paddb [p6], #-204 +; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-208 +; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-212 +; ZOL-NEXT: lda dj4, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-216; mov p7, sp +; ZOL-NEXT: lda dn0, [p6, #0]; st m0, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p7], #-200; mov p6, sp +; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; ZOL-NEXT: lda dn4, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-228 ; ZOL-NEXT: lda r10, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-232; mov p7, sp -; ZOL-NEXT: lda dj1, [p6, #0] -; ZOL-NEXT: paddb [p7], #-200; mov p6, sp -; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-236 -; ZOL-NEXT: lda r11, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill +; ZOL-NEXT: lda dj1, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-236 +; ZOL-NEXT: lda r11, [p6, #0] +; ZOL-NEXT: mov p6, sp ; ZOL-NEXT: paddb [p6], #-240 ; 
ZOL-NEXT: lda dn1, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-244 ; ZOL-NEXT: lda dn5, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-292 -; ZOL-NEXT: lda m2, [p6, #0] -; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: paddb [p6], #-296 -; ZOL-NEXT: lda dj2, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-300 -; ZOL-NEXT: lda dn2, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-248 ; ZOL-NEXT: lda r12, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-252; mov p7, sp -; ZOL-NEXT: lda dj2, [p6, #0]; st m2, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: lda dj2, [p6, #0] ; ZOL-NEXT: mov p6, sp ; ZOL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-256 // 4-byte Folded Reload ; ZOL-NEXT: lda dj6, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-260 ; ZOL-NEXT: lda dn2, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-264; st dj2, [sp, #-88] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p6], #-264 ; ZOL-NEXT: lda dn6, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-268 ; ZOL-NEXT: lda r13, [p6, #0] ; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276; st dn2, [sp, #-92] // 4-byte Folded Reload4-byte Folded Spill +; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276 // 4-byte Folded Reload ; ZOL-NEXT: lda dn3, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-280 ; ZOL-NEXT: lda r25, [p6, #0]; mov p6, sp @@ -479,53 +479,53 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vlda wl6, [p1], #32; nopb ; nopxm +; ZOL-NEXT: vlda wl6, [p1], #32; nopx ; ZOL-NEXT: vlda wl3, [p0], m6; mov r0, p0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; ZOL-NEXT: vlda wh6, [p1], #32 ; ZOL-NEXT: vlda wh3, [p0], m6; mov m5, p4 ; ZOL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 ; ZOL-NEXT: vlda wl8, [p1], #32 -; ZOL-NEXT: vlda wl5, [p0], m6 +; ZOL-NEXT: vlda wl7, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; 
ZOL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 ; ZOL-NEXT: vlda wh8, [p1], #32 -; ZOL-NEXT: vlda.3d wh5, [p0], d0 +; ZOL-NEXT: vlda.3d wh7, [p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 +; ZOL-NEXT: vlda wl1, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m5 ; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] -; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 +; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1; movxm ls, #.LBB0_2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; add.nc r1, r5, #-2 -; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; movxm ls, #.LBB0_2 -; ZOL-NEXT: vlda wl10, [p1], #32; movxm le, #.L_LEnd0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; mov lc, r1 -; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; nopx ; mov r1, p0; nopv -; ZOL-NEXT: nopb ; vlda wh3, [p0], m6; nops ; and r0, r0, r9; nopm ; nopv -; ZOL-NEXT: nopb ; vlda wl5, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv -; ZOL-NEXT: nopb ; vlda.3d wh5, [p0], d0; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv -; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; nopx ; vshift.align x2, x2, s1, x5, r0; nopv -; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: nopb ; vlda wl1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: vlda wh1, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; mov lc, r1 +; ZOL-NEXT: vlda wl5, [p0], m6; mov r1, p0 +; ZOL-NEXT: vlda wh5, [p0], m6; movxm le, #.L_LEnd0 +; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bmh7, s0, [p2, #32]; nops ; and r0, r0, r9; nopm ; nopv +; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv +; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; 
nopv +; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0; nopv +; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv +; ZOL-NEXT: nopb ; vlda wl10, [p1], #32; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv +; ZOL-NEXT: vlda wh10, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 ; ZOL-NEXT: and r1, r1, r9 ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 ; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; ZOL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 -; ZOL-NEXT: vlda wh3, [p0], m6; vshift.align x2, x2, s1, x5, r0 -; ZOL-NEXT: vlda wl5, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 -; ZOL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 -; ZOL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 -; ZOL-NEXT: vlda wl1, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 +; ZOL-NEXT: vlda wh5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; ZOL-NEXT: vlda wl3, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 +; ZOL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 +; ZOL-NEXT: vlda wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 +; ZOL-NEXT: vlda wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 +; ZOL-NEXT: vlda wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: -; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; and r1, r1, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 +; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, 
x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 @@ -539,25 +539,25 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload ; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm6, x0, x8, r4 // 4-byte Folded Reload ; ZOL-NEXT: vmac cm4, cm5, x9, x8, r4 -; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x3, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload -; ZOL-NEXT: vshift.align x2, x2, s1, x5, r0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; ZOL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: vshuffle x6, x4, x2, r2 ; ZOL-NEXT: vmac cm6, cm7, x6, x8, r4 -; ZOL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x10, r4 +; ZOL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x1, r4 ; ZOL-NEXT: st dn7, [sp, #-92] // 4-byte Folded Spill -; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x10, r4 +; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; ZOL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x10, r4 +; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 -; ZOL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] -; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x1, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x1, r4 // 4-byte Folded Reload +; ZOL-NEXT: 
lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x10, r4 // 4-byte Folded Reload ; ZOL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m1, r26 // 4-byte Folded Reload -; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x1, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]; mov dj5, r11 -; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r12; vmac cm4, cm6, x5, x1, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r12; vmac cm4, cm6, x5, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov m3, r13 ; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 ; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index eac8073544b8..ece27639a0ac 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -45,35 +45,35 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: mov dc0, dj3 ; ASM-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill ; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-44; mov dc4, dj3 +; ASM-NEXT: paddb [p6], #-132; mov dc4, dj3 +; ASM-NEXT: lda m5, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-136; mov dc1, dj3 +; ASM-NEXT: lda r28, [p6, #0]; mov r25, dj3 +; ASM-NEXT: mov p6, sp +; ASM-NEXT: paddb [p6], #-140; mov dc2, dj3 +; ASM-NEXT: lda r27, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-44 ; ASM-NEXT: lda m0, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-48; mov dc1, dj3 -; ASM-NEXT: lda dj0, [p6, #0]; mov r25, dj3 +; ASM-NEXT: paddb [p6], #-48 +; ASM-NEXT: lda dj0, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-52; mov dc6, dj3 +; ASM-NEXT: lda dj4, [p6, #0] ; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-52; mov dc2, dj3 -; ASM-NEXT: lda dj4, [p6, #0]; mov p6, sp ; 
ASM-NEXT: paddb [p6], #-56 ; ASM-NEXT: lda dn0, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-60 ; ASM-NEXT: lda dn4, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-68; mov dc6, dj3 -; ASM-NEXT: lda r10, [p6, #0] +; ASM-NEXT: paddb [p6], #-68 +; ASM-NEXT: lda r10, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-72; mov dc3, dj3 +; ASM-NEXT: lda dj1, [p6, #0] ; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-72 -; ASM-NEXT: lda dj1, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-76 ; ASM-NEXT: lda r11, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-80 ; ASM-NEXT: lda dn1, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-84; mov dc3, dj3 -; ASM-NEXT: lda r12, [p6, #0] -; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-132 -; ASM-NEXT: lda m5, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-136 -; ASM-NEXT: lda r28, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-140 -; ASM-NEXT: lda r27, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-84 +; ASM-NEXT: lda r12, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-88; mov dc7, dj3 ; ASM-NEXT: lda r13, [p6, #0] ; ASM-NEXT: mov p6, sp diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index 24b62e2d76d5..a292a15d449f 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -67,10 +67,10 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ ; CHECK-NEXT: paddb [p3], #-20 ; CHECK-NEXT: lda dn4, [p3, #0]; mov p3, sp ; CHECK-NEXT: mova dc0, #0; paddb [p3], #-24; movx r2, #1 -; CHECK-NEXT: lda m0, [p3, #0]; movx r3, #0 -; CHECK-NEXT: movx r4, #-1; mov dc4, dc0 -; CHECK-NEXT: extend.u8 r5, r5 -; CHECK-NEXT: lshl r1, r1, r4; mov s0, r5 +; CHECK-NEXT: lda m0, [p3, #0]; extend.u8 r5, r5 +; CHECK-NEXT: movx r3, #0; mov s0, r5 +; CHECK-NEXT: movx r4, #-1 +; CHECK-NEXT: lshl r1, r1, r4; mov dc4, dc0 ; CHECK-NEXT: ne r2, r0, r2; vbcst.8 x0, r3 ; CHECK-NEXT: movx r0, #808; mov crSRSSign, r2 ; 
CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir index b71fca714ae3..53d67b09d727 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir @@ -47,11 +47,9 @@ body: | %7:edc = COPY $r7 %8:ep = COPY $p0 - ; ISel code for: %200(p0), %300(i20), %400(i20) = G_INTRINSIC(add_3d_byte) %8, %0, %1, %2, %3, %4, %5, %6, %7 %100:eds = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count, %4, %subreg.sub_hi_dim_then_sub_mod, %5, %subreg.sub_hi_dim_then_sub_dim_size, %6, %subreg.sub_hi_dim_then_sub_dim_stride, %7, %subreg.sub_hi_dim_then_sub_dim_count %200:ep, %300:edc, %400:edc = PADDA_3D %8, %100 - ; ISel code for: %201(p0), %301(i20), %401(i20) = G_INTRINSIC(add_3d_byte) %8, %0, %1, %2, %300, %4, %5, %6, %400 %101:eds = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %300, %subreg.sub_dim_count, %4, %subreg.sub_hi_dim_then_sub_mod, %5, %subreg.sub_hi_dim_then_sub_dim_size, %6, %subreg.sub_hi_dim_then_sub_dim_stride, %400, %subreg.sub_hi_dim_then_sub_dim_count %201:ep, %301:edc, %401:edc = PADDA_3D %200, %101 @@ -84,38 +82,42 @@ body: | ; CHECK-LABEL: name: test_4_padd_scarce ; CHECK: liveins: $m0, $p0, $p1, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $d3_3d ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $m2 = MOV_mv_scl killed $r8 - ; CHECK-NEXT: dead renamable $m6 = KILL killed $r12 - ; CHECK-NEXT: $dn2 = MOV_mv_scl killed $r9 - ; CHECK-NEXT: $dj2 = MOV_mv_scl killed $r10 - ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $r11 - ; CHECK-NEXT: $dn6 = MOV_mv_scl killed $r13 - ; CHECK-NEXT: $dj6 = MOV_mv_scl killed $r14 - ; CHECK-NEXT: $dc6 = MOV_mv_scl killed $r15 - ; CHECK-NEXT: $p2 = MOV_mv_scl $p1 - ; CHECK-NEXT: $p1, dead $dc2, dead $dc6 = PADDA_3D killed $p1, $d2_3d - ; CHECK-NEXT: $m1 = 
MOV_mv_scl $r0 - ; CHECK-NEXT: $dn1 = MOV_mv_scl $r1 - ; CHECK-NEXT: $dj1 = MOV_mv_scl $r2 - ; CHECK-NEXT: $dc1 = MOV_mv_scl $r3 - ; CHECK-NEXT: $m5 = MOV_mv_scl $r4 - ; CHECK-NEXT: $dn5 = MOV_mv_scl $r5 - ; CHECK-NEXT: $dj5 = MOV_mv_scl $r6 - ; CHECK-NEXT: $dc5 = MOV_mv_scl $r7 + ; CHECK-NEXT: frame-setup PADDB_sp_imm 32, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $m2 = MOV_mv_scl killed $r0 + ; CHECK-NEXT: $m1 = MOV_mv_scl killed $r8 ; CHECK-NEXT: $dn2 = MOV_mv_scl killed $r1 ; CHECK-NEXT: $dj2 = MOV_mv_scl killed $r2 - ; CHECK-NEXT: $m2 = MOV_mv_scl killed $r0 + ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $r3 + ; CHECK-NEXT: $m6 = MOV_mv_scl killed $r4 ; CHECK-NEXT: $dn6 = MOV_mv_scl killed $r5 ; CHECK-NEXT: $dj6 = MOV_mv_scl killed $r6 - ; CHECK-NEXT: $p2, dead $dc1, dead $dc5 = PADDA_3D killed $p2, $d1_3d - ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $r3 - ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: $dc6 = MOV_mv_scl killed $r7 - ; CHECK-NEXT: $p0, $dc2, $dc6 = PADDA_3D killed $p0, killed $d2_3d - ; CHECK-NEXT: dead renamable $m6 = KILL killed $r4 - ; CHECK-NEXT: $dc1 = MOV_mv_scl killed $dc2 - ; CHECK-NEXT: $dc5 = MOV_mv_scl killed $dc6 - ; CHECK-NEXT: $p0, dead $dc1, dead $dc5 = PADDA_3D killed $p0, killed $d1_3d + ; CHECK-NEXT: ST_dms_spill killed $m1, -32, implicit $sp :: (store (s32) into %stack.0) + ; CHECK-NEXT: $m1 = MOV_mv_scl $m2 + ; CHECK-NEXT: $dn1 = MOV_mv_scl $dn2 + ; CHECK-NEXT: $dj1 = MOV_mv_scl $dj2 + ; CHECK-NEXT: $dc1 = MOV_mv_scl $dc2 + ; CHECK-NEXT: $m5 = MOV_mv_scl $m6 + ; CHECK-NEXT: $dn5 = MOV_mv_scl $dn6 + ; CHECK-NEXT: $dj5 = MOV_mv_scl $dj6 + ; CHECK-NEXT: $dc5 = MOV_mv_scl $dc6 + ; CHECK-NEXT: $p0, $dc1, $dc5 = PADDA_3D killed $p0, $d1_3d + ; CHECK-NEXT: $m1 = LDA_dms_spill -32, implicit $sp :: (load (s32) from %stack.0) + ; CHECK-NEXT: $p2 = MOV_mv_scl $p1 + ; CHECK-NEXT: $p2, $dc2, $dc6 = PADDA_3D killed $p2, $d2_3d + ; CHECK-NEXT: $dn1 = MOV_mv_scl killed $r9 + ; CHECK-NEXT: $dj1 = MOV_mv_scl killed $r10 + ; CHECK-NEXT: $dn5 = 
MOV_mv_scl killed $r13 + ; CHECK-NEXT: frame-destroy PADDB_sp_imm -32, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $dj5 = MOV_mv_scl killed $r14 + ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $dc1 + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $dc6 = MOV_mv_scl killed $dc5 + ; CHECK-NEXT: $dc1 = MOV_mv_scl killed $r11 + ; CHECK-NEXT: $dc5 = MOV_mv_scl killed $r15 + ; CHECK-NEXT: $p0, dead $dc2, dead $dc6 = PADDA_3D killed $p0, killed $d2_3d + ; CHECK-NEXT: $p1, dead $dc1, dead $dc5 = PADDA_3D killed $p1, killed $d1_3d + ; CHECK-NEXT: dead renamable $m5 = KILL killed $r12 ; CHECK-NEXT: DelayedSchedBarrier implicit killed renamable $p0, implicit killed renamable $p1, implicit killed renamable $p2, implicit killed $m0, implicit killed $d3_3d %0:em = COPY $r0 %1:edn = COPY $r1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir index ddf2c75b1e5e..3e5609195bee 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir @@ -49,20 +49,20 @@ body: | ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc1024 = COPY [[COPY3]] ; CHECK-NEXT: [[VADD1:%[0-9]+]]:acc1024 = VADD [[COPY6]], [[COPY18]], [[COPY14]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc1024 = COPY [[COPY2]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[VADD2:%[0-9]+]]:acc1024 = VADD [[COPY7]], [[COPY19]], [[COPY14]] ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc1024 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[VADD3:%[0-9]+]]:acc1024 = VADD [[COPY8]], [[COPY20]], 
[[COPY14]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], 
[[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], 
[[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) ; CHECK-NEXT: [[COPY13:%[0-9]+]]:er = ADD_add_r_ri [[COPY13]], -4, implicit-def dead $srcarry ; CHECK-NEXT: PseudoJNZ [[COPY13]], %bb.1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir index 1eee41a450b1..f1e3015a6ddf 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir @@ -65,22 +65,22 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY3]], [[COPY17]], [[COPY33]], [[COPY22]] ; CHECK-NEXT: [[VSHUFFLE:%[0-9]+]]:vec512 = VSHUFFLE [[COPY14]], [[COPY15]], [[COPY21]] ; CHECK-NEXT: [[VSHUFFLE1:%[0-9]+]]:vec512 = VSHUFFLE [[VSHUFFLE]], [[COPY9]], [[COPY21]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY1]], [[COPY16]], [[COPY32]], [[COPY22]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY4]], [[VSHUFFLE1]], [[COPY32]], [[COPY22]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY5]], [[VSHUFFLE1]], [[COPY33]], [[COPY22]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY1]], [[COPY16]], [[COPY32]], [[COPY22]] ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY6]], [[VSHUFFLE]], [[COPY32]], [[COPY22]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY29]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY5]], [[VSHUFFLE1]], [[COPY33]], [[COPY22]] - ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub_256_lo:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:er = COPY [[COPY19]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY14]], [[COPY31]], [[COPY18]], [[COPY34]] ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense 
[[COPY7]], [[COPY16]], [[COPY33]], [[COPY22]] + ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub_256_lo:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 ; CHECK-NEXT: [[COPY29:%[0-9]+]].sub_256_hi:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY8]], [[VSHUFFLE]], [[COPY33]], [[COPY22]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec512 = COPY [[COPY30]] ; CHECK-NEXT: undef [[COPY30:%[0-9]+]].sub_256_lo:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 - ; CHECK-NEXT: [[COPY34:%[0-9]+]]:er = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY14]], [[COPY31]], [[COPY18]], [[COPY34]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY15]], [[COPY31]], [[COPY28]], [[COPY34]] ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_256_lo:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY26]], [[COPY]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub_256_hi:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY26]], [[COPY]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY15]], [[COPY31]], [[COPY28]], [[COPY34]] ; CHECK-NEXT: undef [[COPY28:%[0-9]+]].sub_256_lo:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY26]], [[COPY]] ; CHECK-NEXT: [[COPY28:%[0-9]+]].sub_256_hi:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit, [[COPY27:%[0-9]+]].sub_dim_count:eds, [[COPY27:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY26]], [[COPY27]] ; CHECK-NEXT: [[COPY30:%[0-9]+]].sub_256_hi:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir index 4910343c9f68..bf953955efc8 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir @@ -34,34 +34,34 @@ body: | ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY $x0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:ep_as_32bit = COPY $p0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:eds = COPY $d0_3d - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:eds = COPY $d0_3d + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vec512 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY8]], [[COPY4]], [[COPY17]], [[COPY10]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY14]], [[COPY5]], [[COPY17]], [[COPY10]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY9]] - ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_256_lo:vec512, [[COPY11:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY11]], 32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_256_hi:vec512, [[COPY11:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY11]], 32 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY2]], [[COPY16]], [[COPY6]], [[COPY10]] - ; CHECK-NEXT: undef 
[[COPY6:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 - ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY3]], [[COPY16]], [[COPY7]], [[COPY10]] - ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 - ; CHECK-NEXT: [[COPY7:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit, [[COPY13:%[0-9]+]].sub_dim_count:eds, [[COPY13:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY12]], [[COPY13]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = VSHUFFLE [[COPY2]], [[COPY3]], [[COPY10]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = VSHUFFLE [[COPY4]], [[COPY]], [[COPY10]] - ; CHECK-NEXT: PseudoJNZ [[COPY15]], %bb.1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY8]], [[COPY4]], [[COPY17]], [[COPY11]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY15]], [[COPY5]], [[COPY17]], [[COPY11]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY10]] + ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY2]], [[COPY9]], [[COPY6]], [[COPY11]] + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY3]], [[COPY9]], [[COPY7]], [[COPY11]] + ; CHECK-NEXT: undef 
[[COPY7:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY7:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit, [[COPY14:%[0-9]+]].sub_dim_count:eds, [[COPY14:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY13]], [[COPY14]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = VSHUFFLE [[COPY2]], [[COPY3]], [[COPY11]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = VSHUFFLE [[COPY4]], [[COPY]], [[COPY11]] + ; CHECK-NEXT: PseudoJNZ [[COPY16]], %bb.1 ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -126,17 +126,17 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec1024 = COPY $y2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec1024 = COPY $y2 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:ep_as_32bit = COPY $p0 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:ep_as_32bit = COPY $p0 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:eds = COPY $d0_3d - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY $x0 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = COPY $x0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec1024 = COPY $y2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:eds = COPY $d0_3d + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec1024 = COPY $y2 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec1024 = COPY $y2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:ep_as_32bit = COPY $p0 ; CHECK-NEXT: 
[[COPY14:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vec512 = COPY $x0 @@ -148,21 +148,21 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vec512 = COPY [[COPY11]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY4]], [[COPY14]], [[COPY20]], [[COPY5]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY13]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY12]], [[COPY19]], [[COPY16]], [[COPY5]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY9]], [[COPY15]], [[COPY20]], [[COPY5]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY18]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY1]], [[COPY19]], [[COPY17]], [[COPY5]] - ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_256_lo:vec512, [[COPY6:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY6]], 32 - ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub_256_hi:vec512, [[COPY6:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY6]], 32 - ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_256_lo:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY7]], 32 - ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_256_hi:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY7]], 32 - ; CHECK-NEXT: undef [[COPY17:%[0-9]+]].sub_256_lo:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY7]], 32 - ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub_256_hi:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit, [[COPY8:%[0-9]+]].sub_dim_count:eds, [[COPY8:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY7]], [[COPY8]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHUFFLE [[COPY12]], [[COPY1]], [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHUFFLE [[COPY14]], [[COPY]], [[COPY5]] - ; CHECK-NEXT: PseudoJNZ [[COPY10]], %bb.1 + ; 
CHECK-NEXT: [[COPY20:%[0-9]+]]:vec512 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY3]], [[COPY14]], [[COPY20]], [[COPY4]], implicit [[COPY2]], implicit [[COPY10]], implicit [[COPY11]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY8]], [[COPY19]], [[COPY16]], [[COPY4]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY6]], [[COPY15]], [[COPY20]], [[COPY4]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY18]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY9]], [[COPY19]], [[COPY17]], [[COPY4]] + ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: undef [[COPY17:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit, [[COPY5:%[0-9]+]].sub_dim_count:eds, [[COPY5:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY13]], [[COPY5]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHUFFLE [[COPY8]], [[COPY9]], [[COPY4]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHUFFLE [[COPY14]], [[COPY]], [[COPY4]] + ; CHECK-NEXT: PseudoJNZ [[COPY7]], %bb.1 ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: