From 8b28bb39607102efc0d3c348205f16de377826af Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Mon, 29 Sep 2025 09:42:12 +0200 Subject: [PATCH] [AIE2P] split offsets before postincrement in large Load/Stores Put the zero-offset part last, since it doesn't need the pointer to survive. --- .../AIE/aie2p/AIE2PInstructionSelector.cpp | 8 +- .../GlobalIsel/inst-select-vector-load.mir | 70 +-- .../inst-select-vector-pre-post-increment.mir | 482 +++++++++--------- .../GlobalIsel/inst-select-vector-store.mir | 98 ++-- .../end-to-end/conv2d_bfp16_kernel_red.ll | 58 +-- llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll | 190 +++---- .../CodeGen/AIE/aie2p/ldst-fifo-stores.ll | 148 +++--- llvm/test/CodeGen/AIE/insertelement.ll | 12 +- 8 files changed, 535 insertions(+), 531 deletions(-) diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp index 0a6652773713..cb9100c3c427 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp @@ -1985,7 +1985,9 @@ bool AIE2PInstructionSelector::selectWideG_AIE_LOAD_STORE( case AIE2P::G_AIE_POSTINC_STORE: case AIE2P::G_AIE_POSTINC_2D_STORE: case AIE2P::G_AIE_POSTINC_3D_STORE: { - for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) { + // Split Offsets first and then perform the postinc. + // For consistency, also handle G_STORE in the same manner. + for (int SubRegIdx = SplitFactor - 1; SubRegIdx >= 0; SubRegIdx--) { const unsigned Offset = SubRegIdx * 64; auto Copy = MIB.buildInstr(TargetOpcode::COPY, {SubRegs[SubRegIdx]}, {}) .addReg(AMI.SrcDstOp.getReg(), 0, @@ -2057,7 +2059,9 @@ bool AIE2PInstructionSelector::selectWideG_AIE_LOAD_STORE( case AIE2P::G_AIE_POSTINC_LOAD: case AIE2P::G_AIE_POSTINC_2D_LOAD: case AIE2P::G_AIE_POSTINC_3D_LOAD: { - for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) { + // Split Offsets first and then perform the postinc. 
+ // For consistency, also handle G_LOAD in the same manner. + for (int SubRegIdx = SplitFactor - 1; SubRegIdx >= 0; SubRegIdx--) { MachineInstrBuilder Load; if (SubRegIdx == 0) { Load = MIB.buildInstr(LSO.ISelOpcode, {SubRegs[0]}, {}); diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-load.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-load.mir index 6f5eea14b430..4c5c9208287b 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-load.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-load.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates # RUN: llc -mtriple aie2p -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s --- @@ -217,9 +217,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 0 :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 0 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], 
%subreg.sub_hi_fifo ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:fiforegbank(<32 x s32>) = G_LOAD %1(p0) :: (load (<32 x s32>)) @@ -238,9 +238,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 0 :: (load (<32 x s16>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<32 x s16>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<32 x s16>), align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 0 :: (load (<32 x s16>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_hi_fifo ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:fiforegbank(<64 x s16>) = G_LOAD %1(p0) :: (load (<64 x s16>)) @@ -258,9 +258,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 0 :: (load (<64 x s8>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<64 x s8>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 
= VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<64 x s8>), align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 0 :: (load (<64 x s8>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_hi_fifo ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:fiforegbank(<128 x s8>) = G_LOAD %1(p0) :: (load (<128 x s8>)) @@ -278,9 +278,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 0 :: (load (<32 x s16>), align 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<32 x s16>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_hi + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<32 x s16>), align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 0 :: (load (<32 x s16>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_hi ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<64 x s16>) = G_LOAD %1(p0) :: (load (<64 x s16>)) @@ -298,9 +298,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 0 :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>) from 
unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_hi + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 0 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_hi ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<32 x s32>) = G_LOAD %1(p0) :: (load (<32 x s32>)) @@ -318,9 +318,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 0 :: (load (<64 x s8>), align 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<64 x s8>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_hi + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<64 x s8>), align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 0 :: (load (<64 x s8>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_hi ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<128 x s8>) = G_LOAD %1(p0) :: (load (<128 x s8>)) @@ -338,9 +338,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: 
[[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi ; CHECK-NEXT: $cml0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:accregbank(<32 x s32>) = G_LOAD %1(p0) :: (load (<32 x s32>)) @@ -358,9 +358,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<8 x s64>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<8 x s64>) from unknown-address + 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<8 x s64>), align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<8 x s64>) from unknown-address + 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_lo, 
[[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi ; CHECK-NEXT: $cml0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:accregbank(<16 x s64>) = G_LOAD %1(p0) :: (load (<16 x s64>)) @@ -378,11 +378,11 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<16 x s32>), align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from unknown-address + 128, align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>) from unknown-address + 192) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>), align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 128, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<16 x s32>) from unknown-address + 192) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_512_acc_hi, 
[[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi ; CHECK-NEXT: $dm0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:accregbank(<64 x s32>) = G_LOAD %1(p0) :: (load (<64 x s32>)) @@ -400,11 +400,11 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<8 x s64>), align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<8 x s64>) from unknown-address + 64) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<8 x s64>) from unknown-address + 128, align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<8 x s64>) from unknown-address + 192) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<8 x s64>), align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<8 x s64>) from unknown-address + 64) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<8 x s64>) from unknown-address + 128, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 0 :: (load (<8 x s64>) from unknown-address + 192) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm3]], 
%subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi ; CHECK-NEXT: $dm0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:accregbank(<32 x s64>) = G_LOAD %1(p0) :: (load (<32 x s64>)) diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir index 50d8da16b9cd..affed5a04b5e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir @@ -842,23 +842,23 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_hi_fifo - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm:%[0-9]+]]:fifo512, 
[[VLDA_dmx_lda_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm2:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm2:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm_imm2]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm2]], 
%subreg.sub_hi_fifo - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm3:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm3:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm_imm4]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm3]], %subreg.sub_hi_fifo - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm4:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm4:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:fifo1024 = 
REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm2]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm4]], %subreg.sub_hi_fifo - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm5:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm5:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm4]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm5]], %subreg.sub_hi_fifo ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]] %0:ptrregbank(p0) = COPY $p0 @@ -895,8 +895,8 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLDA_2D_dmx_lda_fifohl:%[0-9]+]]:fifo512, [[VLDA_2D_dmx_lda_fifohl1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_fifohl2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_fifohl [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = 
VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLDA_2D_dmx_lda_fifohl:%[0-9]+]]:fifo512, [[VLDA_2D_dmx_lda_fifohl1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_fifohl2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_fifohl [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>) from unknown-address + 64) ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_2D_dmx_lda_fifohl]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_hi_fifo ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 @@ -959,30 +959,30 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo1024 = COPY $lf0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo - ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo - ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY6]], [[VST_dmx_sts_fifohl_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY7]], [[VST_dmx_sts_fifohl_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo - ; CHECK-NEXT: 
[[VST_dmx_sts_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY9]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo - ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY10]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY11]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo - ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY12]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY13]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo - ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY14]], [[VST_dmx_sts_fifohl_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY15]], [[VST_dmx_sts_fifohl_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo + ; CHECK-NEXT: 
VST_dmx_sts_fifohl_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo + ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY5]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY6]], [[VST_dmx_sts_fifohl_pstm_nrm]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo + ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY7]], [[VST_dmx_sts_fifohl_pstm_nrm]], 448 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY8]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo + ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY9]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY10]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo + ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY11]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY12]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into 
stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo + ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY13]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY14]], [[VST_dmx_sts_fifohl_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo + ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY15]], [[VST_dmx_sts_fifohl_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack, align 128) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_fifohl_pstm_nrm2]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -1020,11 +1020,11 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:mxs = COPY [[COPY2]] - ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x 
s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:mxs = COPY [[COPY3]] + ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY4]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 64) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1057,11 +1057,11 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:mxs = COPY [[COPY2]] - ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:mxs = COPY [[COPY3]] + ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x 
[[COPY4]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 64) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1092,23 +1092,23 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 - ; CHECK-NEXT: [[VLD_x_pstm_nrm_pseudo:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_pseudo1:%[0-9]+]]:ep = VLD_x_pstm_nrm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLD_x_pstm_nrm_pseudo:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_pseudo1:%[0-9]+]]:ep = VLD_x_pstm_nrm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_pstm_nrm_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_hi - ; CHECK-NEXT: [[VLD_x_pstm_nrm_imm_pseudo:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_imm_pseudo1:%[0-9]+]]:ep = VLD_x_pstm_nrm_imm_pseudo [[VLD_x_pstm_nrm_pseudo1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_pseudo1]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo1:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_pseudo1]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLD_x_pstm_nrm_imm_pseudo:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_imm_pseudo1:%[0-9]+]]:ep = VLD_x_pstm_nrm_imm_pseudo [[VLD_x_pstm_nrm_pseudo1]], 448 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_pstm_nrm_imm_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo1]], %subreg.sub_512_hi - ; CHECK-NEXT: [[VLD_x_pstm_nrm_imm_pseudo2:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_imm_pseudo3:%[0-9]+]]:ep = VLD_x_pstm_nrm_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo2:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo1]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo2:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo1]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLD_x_pstm_nrm_imm_pseudo2:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_imm_pseudo3:%[0-9]+]]:ep = VLD_x_pstm_nrm_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo1]], -512 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_pstm_nrm_imm_pseudo2]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo2]], %subreg.sub_512_hi - ; CHECK-NEXT: [[VLD_x_pstm_nrm_imm_pseudo4:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_imm_pseudo5:%[0-9]+]]:ep = VLD_x_pstm_nrm_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo3:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo3]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo3:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo3]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLD_x_pstm_nrm_imm_pseudo4:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_imm_pseudo5:%[0-9]+]]:ep = VLD_x_pstm_nrm_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo3]], 0 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_pstm_nrm_imm_pseudo4]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo3]], %subreg.sub_512_hi - 
; CHECK-NEXT: [[VLD_x_pstm_nrm_pseudo2:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_pseudo3:%[0-9]+]]:ep = VLD_x_pstm_nrm_pseudo [[VLD_x_pstm_nrm_imm_pseudo5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo4:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo5]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo4:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_imm_pseudo5]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLD_x_pstm_nrm_pseudo2:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_pseudo3:%[0-9]+]]:ep = VLD_x_pstm_nrm_pseudo [[VLD_x_pstm_nrm_imm_pseudo5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_pstm_nrm_pseudo2]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo4]], %subreg.sub_512_hi - ; CHECK-NEXT: [[VLD_x_pstm_nrm_pseudo4:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_pseudo5:%[0-9]+]]:ep = VLD_x_pstm_nrm_pseudo [[VLD_x_pstm_nrm_pseudo3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo5:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_pseudo3]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo5:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[VLD_x_pstm_nrm_pseudo3]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLD_x_pstm_nrm_pseudo4:%[0-9]+]]:vec512, [[VLD_x_pstm_nrm_pseudo5:%[0-9]+]]:ep = VLD_x_pstm_nrm_pseudo [[VLD_x_pstm_nrm_pseudo3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_x_pstm_nrm_pseudo4]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo5]], %subreg.sub_512_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]] %0:ptrregbank(p0) = COPY $p0 @@ -1145,8 +1145,8 @@ body: | ; CHECK-NEXT: 
[[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLD_2D_x_pseudo:%[0-9]+]]:vec512, [[VLD_2D_x_pseudo1:%[0-9]+]]:ep, [[VLD_2D_x_pseudo2:%[0-9]+]]:edc = VLD_2D_x_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLD_2D_x_pseudo:%[0-9]+]]:vec512, [[VLD_2D_x_pseudo1:%[0-9]+]]:ep, [[VLD_2D_x_pseudo2:%[0-9]+]]:edc = VLD_2D_x_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>) from unknown-address + 64) ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_2D_x_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 @@ -1178,8 +1178,8 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[VLD_3D_x_pseudo:%[0-9]+]]:vec512, 
[[VLD_3D_x_pseudo1:%[0-9]+]]:ep, [[VLD_3D_x_pseudo2:%[0-9]+]]:edcl, [[VLD_3D_x_pseudo3:%[0-9]+]]:edch = VLD_3D_x_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[VLD_x_idx_imm_pseudo:%[0-9]+]]:vec512 = VLD_x_idx_imm_pseudo [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLD_3D_x_pseudo:%[0-9]+]]:vec512, [[VLD_3D_x_pseudo1:%[0-9]+]]:ep, [[VLD_3D_x_pseudo2:%[0-9]+]]:edcl, [[VLD_3D_x_pseudo3:%[0-9]+]]:edch = VLD_3D_x_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>) from unknown-address + 64) ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLD_3D_x_pseudo]], %subreg.sub_512_lo, [[VLD_x_idx_imm_pseudo]], %subreg.sub_512_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 @@ -1211,30 +1211,30 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec1024 = COPY $y0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo - ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo - ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY6]], [[VST_dmx_sts_x_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY7]], [[VST_dmx_sts_x_pstm_nrm]], 64 :: (store 
(<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo - ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_x_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY9]], [[VST_dmx_sts_x_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo - ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY10]], [[VST_dmx_sts_x_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY11]], [[VST_dmx_sts_x_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo - ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY12]], [[VST_dmx_sts_x_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY13]], [[VST_dmx_sts_x_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo - ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY14]], [[VST_dmx_sts_x_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY15]], [[VST_dmx_sts_x_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi + ; CHECK-NEXT: 
VST_dmx_sts_x_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo + ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY5]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY6]], [[VST_dmx_sts_x_pstm_nrm]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo + ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY7]], [[VST_dmx_sts_x_pstm_nrm]], 448 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY8]], [[VST_dmx_sts_x_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo + ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY9]], [[VST_dmx_sts_x_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY10]], [[VST_dmx_sts_x_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo + ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY11]], [[VST_dmx_sts_x_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY12]], [[VST_dmx_sts_x_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo + ; CHECK-NEXT: 
[[VST_dmx_sts_x_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY13]], [[VST_dmx_sts_x_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY14]], [[VST_dmx_sts_x_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo + ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY15]], [[VST_dmx_sts_x_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack, align 128) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_x_pstm_nrm2]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -1272,10 +1272,10 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo - ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo + ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 
x s32>) into unknown-address + 64) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1308,10 +1308,10 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo - ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo + ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 64) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1342,23 +1342,23 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 - ; CHECK-NEXT: 
[[VLDA_dmx_lda_bm_pstm_nrm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], -512 :: (load 
(<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 
64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm4:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm4:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm4]], %subreg.sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm5:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm5:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack - 64, basealign 128) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack, align 128) ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm5]], %subreg.sub_512_acc_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]] %0:ptrregbank(p0) = COPY $p0 @@ -1395,8 +1395,8 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3 ; CHECK-NEXT: 
[[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLDA_2D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_2D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_bm2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>), align 128) + ; CHECK-NEXT: [[VLDA_2D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_2D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_bm2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>) from unknown-address + 64) ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_2D_dmx_lda_bm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 @@ -1459,30 +1459,30 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = COPY $cml0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc512 = COPY 
[[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY6]], [[VST_dmx_sts_bm_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY7]], [[VST_dmx_sts_bm_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_bm_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY9]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY10]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY11]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY12]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY13]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 128) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm2:%[0-9]+]]:ep = 
VST_dmx_sts_bm_pstm_nrm [[COPY14]], [[VST_dmx_sts_bm_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 128) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY15]], [[VST_dmx_sts_bm_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY5]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY6]], [[VST_dmx_sts_bm_pstm_nrm]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY7]], [[VST_dmx_sts_bm_pstm_nrm]], 448 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY8]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY9]], [[VST_dmx_sts_bm_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY10]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + 
; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY11]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY12]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY13]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack, align 128) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY14]], [[VST_dmx_sts_bm_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack - 64, basealign 128) + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY15]], [[VST_dmx_sts_bm_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack, align 128) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_bm_pstm_nrm2]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -1520,10 +1520,10 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cml0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_2D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_2D_dmx_sts_bm1:%[0-9]+]]:edc = VST_2D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY 
[[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_2D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_2D_dmx_sts_bm1:%[0-9]+]]:edc = VST_2D_dmx_sts_bm [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 64) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1556,10 +1556,10 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cml0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_3D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_3D_dmx_sts_bm1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_bm2:%[0-9]+]]:edch = VST_3D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; 
CHECK-NEXT: [[VST_3D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_3D_dmx_sts_bm1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_bm2:%[0-9]+]]:edch = VST_3D_dmx_sts_bm [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 64) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1590,36 +1590,36 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm4:%[0-9]+]]:acc512 = 
VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm5:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm4]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm5]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm6:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm7:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm8:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm6]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm7]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm8]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 256) - ; CHECK-NEXT: 
[[VLDA_dmx_lda_bm_idx_imm9:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm10:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm11:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm9]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm10]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm11]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm12:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm13:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm14:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm12]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm13]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm14]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: 
[[VLDA_dmx_lda_bm_pstm_nrm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm15:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm16:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm17:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm15]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm16]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm17]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>) from stack - 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from stack, align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack + 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, 
[[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 192 :: (load (<16 x s32>) from stack - 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm4:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 128 :: (load (<16 x s32>) from stack, align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm5:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack + 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm5]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm4]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm6:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 192 :: (load (<16 x s32>) from stack - 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm7:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 128 :: (load (<16 x s32>) from stack, align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm8:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack + 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:acc2048 = REG_SEQUENCE 
[[VLDA_dmx_lda_bm_pstm_nrm_imm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm8]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm7]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm6]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm9:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 192 :: (load (<16 x s32>) from stack - 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm10:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 128 :: (load (<16 x s32>) from stack, align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm11:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack + 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm11]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm10]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm9]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm12:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 192 :: (load (<16 x s32>) from stack - 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm13:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 128 :: (load (<16 x s32>) from stack, align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm14:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack + 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm3:%[0-9]+]]:ep = 
VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm14]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm13]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm12]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm15:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 192 :: (load (<16 x s32>) from stack - 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm16:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 128 :: (load (<16 x s32>) from stack, align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm17:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack + 64, basealign 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm17]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm16]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm15]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -1655,11 +1655,11 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, 
[[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLDA_2D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_2D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_bm2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 256) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from unknown-address + 128, align 128) - ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>) from unknown-address + 192) - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_2D_dmx_lda_bm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>), align 256) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from unknown-address + 64) + ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 128, align 128) + ; CHECK-NEXT: [[VLDA_2D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_2D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_bm2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>) from unknown-address + 192) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_2D_dmx_lda_bm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], 
%subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1721,54 +1721,54 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc2048 = COPY $dm0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 256) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 256) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY6]], [[COPY]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY7]], [[COPY]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_bm_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 256) - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY9]], [[VST_dmx_sts_bm_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 256) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY10]], [[VST_dmx_sts_bm_pstm_nrm]], 128 :: (store (<16 x s32>) 
into stack + 64, basealign 256) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY11]], [[VST_dmx_sts_bm_pstm_nrm]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY12]], [[VST_dmx_sts_bm_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 256) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY13]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 256) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY14]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY15]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY16]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 256) - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY17]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 256) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY18]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256) - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc512 = 
COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY19]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY20]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 256) - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY21]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 256) - ; CHECK-NEXT: [[COPY22:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY22]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256) - ; CHECK-NEXT: [[COPY23:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY23]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) - ; CHECK-NEXT: [[COPY24:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY24]], [[VST_dmx_sts_bm_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 256) - ; CHECK-NEXT: [[COPY25:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY25]], [[VST_dmx_sts_bm_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 256) - ; CHECK-NEXT: [[COPY26:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY26]], [[VST_dmx_sts_bm_pstm_nrm1]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256) - ; CHECK-NEXT: [[COPY27:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi - 
; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY27]], [[VST_dmx_sts_bm_pstm_nrm1]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 192 :: (store (<16 x s32>) into stack - 64, basealign 256) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 128 :: (store (<16 x s32>) into stack, align 256) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY6]], [[COPY]], 64 :: (store (<16 x s32>) into stack + 64, basealign 256) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY7]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY8]], [[VST_dmx_sts_bm_pstm_nrm]], 192 :: (store (<16 x s32>) into stack - 64, basealign 256) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY9]], [[VST_dmx_sts_bm_pstm_nrm]], 128 :: (store (<16 x s32>) into stack, align 256) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY10]], [[VST_dmx_sts_bm_pstm_nrm]], 64 :: (store (<16 x s32>) into stack + 64, basealign 256) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY11]], [[VST_dmx_sts_bm_pstm_nrm]], 448 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:acc512 = 
COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY12]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 192 :: (store (<16 x s32>) into stack - 64, basealign 256) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY13]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 128 :: (store (<16 x s32>) into stack, align 256) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY14]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack + 64, basealign 256) + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY15]], [[VST_dmx_sts_bm_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY16]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 192 :: (store (<16 x s32>) into stack - 64, basealign 256) + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY17]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 128 :: (store (<16 x s32>) into stack, align 256) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY18]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack + 64, basealign 256) + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY19]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: 
VST_dmx_sts_bm_idx_imm [[COPY20]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 192 :: (store (<16 x s32>) into stack - 64, basealign 256) + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY21]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 128 :: (store (<16 x s32>) into stack, align 256) + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY22]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack + 64, basealign 256) + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY23]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY24]], [[VST_dmx_sts_bm_pstm_nrm1]], 192 :: (store (<16 x s32>) into stack - 64, basealign 256) + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY25]], [[VST_dmx_sts_bm_pstm_nrm1]], 128 :: (store (<16 x s32>) into stack, align 256) + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY26]], [[VST_dmx_sts_bm_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack + 64, basealign 256) + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY27]], [[VST_dmx_sts_bm_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack + 128, align 128, basealign 256) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_bm_pstm_nrm2]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -1806,14 +1806,14 @@ body: | 
; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc2048 = COPY $dm0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_2D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_2D_dmx_sts_bm1:%[0-9]+]]:edc = VST_2D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 256) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 128, align 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 192 :: (store (<16 x s32>) into unknown-address + 192) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 192 :: (store (<16 x s32>), align 256) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 128, align 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_2D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_2D_dmx_sts_bm1:%[0-9]+]]:edc = 
VST_2D_dmx_sts_bm [[COPY5]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 192) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -1846,14 +1846,14 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc2048 = COPY $dm0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: [[VST_3D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_3D_dmx_sts_bm1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_bm2:%[0-9]+]]:edch = VST_3D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 256) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 128, align 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 192 :: (store (<16 x s32>) into unknown-address + 192) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 192 :: (store (<16 x s32>), align 256) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo + ; 
CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 128, align 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: [[VST_3D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_3D_dmx_sts_bm1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_bm2:%[0-9]+]]:edch = VST_3D_dmx_sts_bm [[COPY5]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>) into unknown-address + 192) ; CHECK-NEXT: PseudoRET implicit $lr %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-store.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-store.mir index b82b8928a2e0..30e6a24ff8e1 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-store.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-store.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc -mtriple aie2p -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s --- @@ -219,10 +219,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<16 x s32>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:fiforegbank(<32 x s32>) = COPY $lf0 G_STORE %1(<32 x s32>), %0(p0) :: (store (<32 x s32>)) @@ -241,10 +241,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<32 x s16>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<32 x s16>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<32 x s16>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<32 x s16>) into unknown-address + 64) 
%0:ptrregbank(p0) = COPY $p0 %1:fiforegbank(<64 x s16>) = COPY $lf0 G_STORE %1(<64 x s16>), %0(p0) :: (store (<64 x s16>)) @@ -262,10 +262,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<64 x s8>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo - ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<64 x s8>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<64 x s8>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo + ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<64 x s8>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:fiforegbank(<128 x s8>) = COPY $lf0 G_STORE %1(<128 x s8>), %0(p0) :: (store (<128 x s8>)) @@ -283,10 +283,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<32 x s16>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<32 x s16>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<32 x s16>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<32 x s16>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:vregbank(<64 x 
s16>) = COPY $y0 G_STORE %1(<64 x s16>), %0(p0) :: (store (<64 x s16>)) @@ -304,10 +304,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<16 x s32>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:vregbank(<32 x s32>) = COPY $y0 G_STORE %1(<32 x s32>), %0(p0) :: (store (<32 x s32>)) @@ -325,10 +325,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<64 x s8>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi - ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<64 x s8>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<64 x s8>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo + ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<64 x s8>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:vregbank(<128 x s8>) = COPY $y0 G_STORE %1(<128 x s8>), %0(p0) :: (store (<128 x s8>)) @@ -346,10 
+346,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cml0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<16 x s32>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<16 x s32>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<16 x s32>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:accregbank(<32 x s32>) = COPY $cml0 G_STORE %1(<32 x s32>), %0(p0) :: (store (<32 x s32>)) @@ -367,10 +367,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cml0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<8 x s64>), align 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<8 x s64>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 64 :: (store (<8 x s64>), align 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 0 :: (store (<8 x s64>) into unknown-address + 64) %0:ptrregbank(p0) = COPY $p0 %1:accregbank(<16 x s64>) = COPY $cml0 G_STORE %1(<16 x s64>), %0(p0) :: (store (<16 x s64>)) @@ -388,14 +388,14 @@ body: | ; CHECK-NEXT: {{ 
$}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc2048 = COPY $dm0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<16 x s32>), align 256) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 128, align 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 192 :: (store (<16 x s32>) into unknown-address + 192) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 192 :: (store (<16 x s32>), align 256) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 128, align 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 0 :: (store (<16 x s32>) into unknown-address + 192) %0:ptrregbank(p0) = COPY $p0 %1:accregbank(<64 x s32>) = COPY $dm0 G_STORE %1(<64 x s32>), %0(p0) :: (store (<64 x s32>)) @@ -413,14 +413,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc2048 = COPY $dm0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY 
[[COPY1]].sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 0 :: (store (<8 x s64>), align 256) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<8 x s64>) into unknown-address + 64) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 128 :: (store (<8 x s64>) into unknown-address + 128, align 128) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi - ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 192 :: (store (<8 x s64>) into unknown-address + 192) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY]], 192 :: (store (<8 x s64>), align 256) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 128 :: (store (<8 x s64>) into unknown-address + 64) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<8 x s64>) into unknown-address + 128, align 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo + ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 0 :: (store (<8 x s64>) into unknown-address + 192) %0:ptrregbank(p0) = COPY $p0 %1:accregbank(<32 x s64>) = COPY $dm0 G_STORE %1(<32 x s64>), %0(p0) :: (store (<32 x s64>)) diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll index 27500dc87532..9f8a41d31afa 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll @@ -22,22 +22,22 @@ define 
dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill ; CHECK-NEXT: mov p6, sp ; CHECK-NEXT: padda [p6], #-320 -; CHECK-NEXT: vlda bmll3, [p6, #0] -; CHECK-NEXT: vlda bmlh3, [p6, #64] -; CHECK-NEXT: vlda bmhl3, [p6, #128]; mov m0, #-576 -; CHECK-NEXT: vlda bmhh3, [p6, #192]; mov p6, sp +; CHECK-NEXT: vlda bmhh3, [p6, #192] +; CHECK-NEXT: vlda bmhl3, [p6, #128] +; CHECK-NEXT: vlda bmlh3, [p6, #64]; mov m0, #-576 +; CHECK-NEXT: vlda bmll3, [p6, #0]; mov p6, sp ; CHECK-NEXT: padda [p6], m0 -; CHECK-NEXT: vlda bmll2, [p6, #0] -; CHECK-NEXT: vlda bmlh2, [p6, #64] -; CHECK-NEXT: vlda bmhl2, [p6, #128]; movxm m0, #-1092 -; CHECK-NEXT: vlda bmhh2, [p6, #192]; mov p6, sp +; CHECK-NEXT: vlda bmhh2, [p6, #192] +; CHECK-NEXT: vlda bmhl2, [p6, #128] +; CHECK-NEXT: vlda bmlh2, [p6, #64]; movxm m0, #-1092 +; CHECK-NEXT: vlda bmll2, [p6, #0]; mov p6, sp ; CHECK-NEXT: mova m0, #-832; paddb [p6], m0 ; CHECK-NEXT: lda dj4, [p6, #0]; mov p6, sp ; CHECK-NEXT: padda [p6], m0 -; CHECK-NEXT: vlda bmll1, [p6, #0] -; CHECK-NEXT: vlda bmlh1, [p6, #64] -; CHECK-NEXT: vlda bmhl1, [p6, #128]; st p7, [sp, #-64]; movxm m0, #-1096 // 4-byte Folded Spill -; CHECK-NEXT: vlda bmhh1, [p6, #192]; mov p6, sp +; CHECK-NEXT: vlda bmhh1, [p6, #192] +; CHECK-NEXT: vlda bmhl1, [p6, #128] +; CHECK-NEXT: vlda bmlh1, [p6, #64]; st p7, [sp, #-64]; movxm m0, #-1096 // 4-byte Folded Spill +; CHECK-NEXT: vlda bmll1, [p6, #0]; mov p6, sp ; CHECK-NEXT: padda [p6], m0; mov p7, sp ; CHECK-NEXT: lda m1, [p6, #0]; movxm m0, #-1108 ; CHECK-NEXT: mova dc4, #0; mov p6, sp @@ -50,13 +50,13 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-NEXT: lda r2, [p6, #0]; movx r25, #0; mov p6, sp ; CHECK-NEXT: padda [p6], m0; vldb.fill.512 [p1, lf1, r25]; mov dn0, p3 ; CHECK-NEXT: // kill: def $p1 killed $p1 def $lf1 -; CHECK-NEXT: vlda bmll0, [p6, #0]; vldb.fill.512 [p1, lf1, r25]; movs dj0, p4; mov dn4, p5 -; 
CHECK-NEXT: vlda bmlh0, [p6, #64]; vldb.pop.576 ex0, [p1, lf1, r25]; movs dc0, dc4; mov m0, p2 -; CHECK-NEXT: vlda bmhl0, [p6, #128]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movx r24, #0 +; CHECK-NEXT: vlda bmhh0, [p6, #192]; vldb.fill.512 [p1, lf1, r25]; movs dj0, p4; mov dn4, p5 +; CHECK-NEXT: vlda bmhl0, [p6, #128]; vldb.pop.576 ex0, [p1, lf1, r25]; movs dc0, dc4; mov m0, p2 +; CHECK-NEXT: vlda bmlh0, [p6, #64]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movx r24, #0 ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25] ; CHECK-NEXT: // kill: def $p0 killed $p0 def $lf0 ; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; add r1, r6, #-1 -; CHECK-NEXT: vlda bmhh0, [p6, #192]; vldb.pop.576 ex0, [p1, lf1, r25]; movxm ls, #.LBB0_1 +; CHECK-NEXT: vlda bmll0, [p6, #0]; vldb.pop.576 ex0, [p1, lf1, r25]; movxm ls, #.LBB0_1 ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movxm le, #.L_LEnd0 ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; add.nc lc, r1, #-4 ; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv @@ -89,23 +89,23 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-NEXT: vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vst bmll0, [p7, #0] -; CHECK-NEXT: vst bmlh0, [p7, #64] -; CHECK-NEXT: vst bmhl0, [p7, #128] ; CHECK-NEXT: vst bmhh0, [p7, #192] -; CHECK-NEXT: vst bmll1, [p0, #0] -; CHECK-NEXT: vst bmlh1, [p0, #64] +; CHECK-NEXT: vst bmhl0, [p7, #128] +; CHECK-NEXT: vst bmlh0, [p7, #64] +; CHECK-NEXT: vst bmll0, [p7, #0] +; CHECK-NEXT: vst bmhh1, [p0, #192] ; CHECK-NEXT: vst bmhl1, [p0, #128] -; CHECK-NEXT: vst bmhh1, [p0, #192]; mov p0, r3 -; CHECK-NEXT: vst bmll2, [p0, #0] -; CHECK-NEXT: vst bmlh2, [p0, #64] +; CHECK-NEXT: vst bmlh1, [p0, #64] +; CHECK-NEXT: vst bmll1, [p0, #0]; mov p0, r3 +; CHECK-NEXT: vst bmhh2, [p0, #192] ; 
CHECK-NEXT: vst bmhl2, [p0, #128] -; CHECK-NEXT: lda p7, [sp, #-64]; vst bmhh2, [p0, #192] // 4-byte Folded Reload +; CHECK-NEXT: vst bmlh2, [p0, #64] +; CHECK-NEXT: lda p7, [sp, #-64]; vst bmll2, [p0, #0] // 4-byte Folded Reload ; CHECK-NEXT: movs p0, r2; ret lr -; CHECK-NEXT: vst bmll3, [p0, #0] // Delay Slot 5 -; CHECK-NEXT: vst bmlh3, [p0, #64] // Delay Slot 4 -; CHECK-NEXT: vst bmhl3, [p0, #128] // Delay Slot 3 -; CHECK-NEXT: vst bmhh3, [p0, #192]; paddxm [sp], #-64 // Delay Slot 2 +; CHECK-NEXT: vst bmhh3, [p0, #192] // Delay Slot 5 +; CHECK-NEXT: vst bmhl3, [p0, #128] // Delay Slot 4 +; CHECK-NEXT: vst bmlh3, [p0, #64] // Delay Slot 3 +; CHECK-NEXT: vst bmll3, [p0, #0]; paddxm [sp], #-64 // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 newFuncRoot: br label %for.body90.i diff --git a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll index 0909d9c113d4..6a12fce26551 100644 --- a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll +++ b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll @@ -22,8 +22,8 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -31,8 +31,8 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -59,8 +59,8 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state 
; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -68,8 +68,8 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -97,8 +97,8 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -106,8 +106,8 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -137,8 +137,8 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; 
mov m0, r0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.512.2d x0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -146,8 +146,8 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -185,8 +185,8 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.512.3d x0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -194,8 +194,8 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -239,8 +239,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; 
CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -248,8 +248,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -280,8 +280,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -289,8 +289,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -323,8 +323,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.576.2d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -332,8 +332,8 @@ define dso_local %struct.v64bfp16ebs8 
@_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -374,8 +374,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.576.3d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -383,8 +383,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16e ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -431,8 +431,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -440,8 +440,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un ; CHECK-NEXT: nop ; CHECK-NEXT: ret 
lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -472,8 +472,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -481,8 +481,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -515,8 +515,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.544.2d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -524,8 +524,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst 
lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -566,8 +566,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.544.3d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -575,8 +575,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16 ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -621,8 +621,8 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0] -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfe, [p1, #192]; movxm r30, #2015 ; CHECK-NEXT: vldb.popx.512 x0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -632,8 +632,8 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t ; CHECK-NEXT: nop ; CHECK-NEXT: st r24, [p1, dj0]; ret lr ; CHECK-NEXT: vst lfe, [p1, #192] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; 
CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -663,9 +663,9 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s ; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: mova r2, #6 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0; lshl r0, r0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0; lshl r0, r0, r2 ; CHECK-NEXT: vlda lfe, [p1, #192]; or r30, r0, r1 ; CHECK-NEXT: vldb.fillx.512 [p0, lf0, r24] ; CHECK-NEXT: nop @@ -675,8 +675,8 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s ; CHECK-NEXT: nop ; CHECK-NEXT: st r24, [p1, dj0]; ret lr ; CHECK-NEXT: vst lfe, [p1, #192] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -708,8 +708,8 @@ define dso_local void @_Z18test_fifo_ld_resetRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj0, #128 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movx r24, #0; mov p2, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dj0, #128 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movx r24, #0; mov p2, p0 ; CHECK-NEXT: st r24, [p1, dj0] ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: nop @@ -718,8 +718,8 @@ define dso_local void @_Z18test_fifo_ld_resetRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay 
Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -746,8 +746,8 @@ define dso_local void @_Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_ ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -755,8 +755,8 @@ define dso_local void @_Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_ ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -783,8 +783,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -792,16 +792,16 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r24, [p1, dj0] -; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] +; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: st p0, [p2, #0] ; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.576 ex1, [p0, lf0, r24] ; 
CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -809,8 +809,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -854,8 +854,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -863,16 +863,16 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r24, [p1, dj0] -; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] +; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: st p0, [p2, #0] ; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0] -; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vldb.pop.576 ex1, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -880,8 +880,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] 
// Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -926,8 +926,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p3, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p3, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -935,8 +935,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r24, [p1, dj1] -; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] +; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: st p0, [p3, #0] ; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: lda dc0, [p2, #0] @@ -944,8 +944,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.576.2d ex1, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -953,8 +953,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -1006,8 +1006,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p4, p0 -; 
CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p4, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -1015,8 +1015,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r24, [p1, dj1] -; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] +; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: st p0, [p4, #0] ; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: lda dc0, [p2, #0] @@ -1025,8 +1025,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.576.3d ex1, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -1034,8 +1034,8 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr ; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: diff --git a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll index 22271422c344..99fb2d6d354e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll +++ b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll @@ -16,9 +16,9 @@ define dso_local void @_Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, <64 x i8> 
noundef %v, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: _Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopx +; CHECK-NEXT: vlda sfh, [p1, #64]; nopx ; CHECK-NEXT: lda p2, [p0, #0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -26,8 +26,8 @@ define dso_local void @_Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t(ptr n ; CHECK-NEXT: mova r26, #0 ; CHECK-NEXT: vst.push.512 x0, [p2, sf, r26]; ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -46,10 +46,10 @@ entry: define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, <64 x i8> noundef %v, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: _Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nopxm ; nops ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -58,8 +58,8 @@ define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no ; CHECK-NEXT: vst.push.512 x0, [p2, sf, r26] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; 
CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -82,10 +82,10 @@ entry: define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: _Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nopxm ; nops ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -94,8 +94,8 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -117,10 +117,10 @@ entry: define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 { ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nopxm ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -129,8 +129,8 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-NEXT: 
vst.flush.512 [p2, sf, r26, m0] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -157,8 +157,8 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 ; CHECK-NEXT: lda r26, [p1, dj1] ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: movs p3, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 ; CHECK-NEXT: movs dj0, r2; mov p2, p4 @@ -168,8 +168,8 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p3, #0]; ret lr ; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -199,12 +199,12 @@ entry: define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 { ; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopxm +; CHECK-NEXT: vlda sfh, [p1, #64]; nopxm ; CHECK-NEXT: lda p5, [p0, #0] ; CHECK-NEXT: lda dc0, [p2, 
#0] ; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 ; CHECK-NEXT: lda r26, [p1, dj1] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 @@ -215,8 +215,8 @@ define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-NEXT: st dc0, [p4, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr ; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -256,8 +256,8 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr ; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops ; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -266,8 +266,8 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -292,8 +292,8 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mov m0, r0 @@ -302,8 +302,8 @@ define dso_local 
void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -326,11 +326,11 @@ entry: define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 { ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopx +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nopx ; CHECK-NEXT: lda p4, [p0, #0] ; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 ; CHECK-NEXT: lda r26, [p1, dj1] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p2 @@ -340,8 +340,8 @@ define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p3, #0]; ret lr ; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -376,8 +376,8 @@ define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 ; CHECK-NEXT: lda r26, [p1, dj1] ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: vlda 
sfh, [p1, #64]; movs p4, p2 +; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0]; movs p4, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 ; CHECK-NEXT: movs dj0, r2; mov dn4, r3 ; CHECK-NEXT: movs dj4, r4; mov p2, p5 @@ -388,8 +388,8 @@ define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: st dc0, [p4, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr ; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -427,9 +427,9 @@ entry: define dso_local void @test_fifo_st_reset_v64bfp16ebs16(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs16 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: test_fifo_st_reset_v64bfp16ebs16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopx +; CHECK-NEXT: vlda sfh, [p1, #64]; nopx ; CHECK-NEXT: lda p2, [p0, #0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -437,8 +437,8 @@ define dso_local void @test_fifo_st_reset_v64bfp16ebs16(ptr nocapture nonnull al ; CHECK-NEXT: mova r26, #0 ; CHECK-NEXT: vst.push.544 ex0, [p2, sf, r26]; ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -458,10 +458,10 @@ entry: define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs16 %v.coerce, ptr nocapture nonnull align 64 
dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nopxm ; nops ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -470,8 +470,8 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali ; CHECK-NEXT: vst.push.544 ex0, [p2, sf, r26] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -495,9 +495,9 @@ entry: define dso_local void @test_fifo_st_reset_v64bfp16ebs8(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: test_fifo_st_reset_v64bfp16ebs8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopx +; CHECK-NEXT: vlda sfh, [p1, #64]; nopx ; CHECK-NEXT: lda p2, [p0, #0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -505,8 +505,8 @@ define dso_local void @test_fifo_st_reset_v64bfp16ebs8(ptr nocapture nonnull ali ; CHECK-NEXT: mova r26, #0 ; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26]; ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: 
@@ -526,10 +526,10 @@ entry: define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nopxm ; nops ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -538,8 +538,8 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig ; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -563,10 +563,10 @@ entry: define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bfp16ebs8R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v128bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #1 { ; CHECK-LABEL: _Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bfp16ebs8R12fifo_state_t: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0]; nopx -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -575,12 +575,12 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; 
CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26] ; CHECK-NEXT: nop ; CHECK-NEXT: st r26, [p1, dj0] -; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] +; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: st p2, [p0, #0] -; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -589,8 +589,8 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-NEXT: vst.push.576 ex1, [p2, sf, r26] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -624,10 +624,10 @@ entry: define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #1 { ; CHECK-LABEL: _Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0]; nopx -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -636,12 +636,12 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: nop ; CHECK-NEXT: st r26, [p1, dj0] -; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] +; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: st p2, [p0, #0] -; CHECK-NEXT: vlda sfl, [p1, 
#0] -; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -650,8 +650,8 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -681,10 +681,10 @@ entry: define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #1 { ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: vlda sfh, [p1, #64]; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 ; CHECK-NEXT: lda r26, [p1, dj0] -; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -693,12 +693,12 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: nop ; CHECK-NEXT: st r26, [p1, dj0] -; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] +; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: st p2, [p0, #0] -; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] +; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -707,8 +707,8 @@ define 
dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-NEXT: vst.flush.512 [p2, sf, r26, m0] ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: diff --git a/llvm/test/CodeGen/AIE/insertelement.ll b/llvm/test/CodeGen/AIE/insertelement.ll index 168e9dcbea84..7798e8d71ad9 100644 --- a/llvm/test/CodeGen/AIE/insertelement.ll +++ b/llvm/test/CodeGen/AIE/insertelement.ll @@ -381,8 +381,8 @@ define <128 x i8> @insert_v128i8_dyn(<128 x i8> %v, i8 %e, i32 %idx) nounwind { ; AIE2P-NEXT: and r1, r1, r2 ; AIE2P-NEXT: mov dj0, r1 ; AIE2P-NEXT: padda [p0], #-128 -; AIE2P-NEXT: vst x6, [p0, #0] ; AIE2P-NEXT: vst x7, [p0, #64] +; AIE2P-NEXT: vst x6, [p0, #0] ; AIE2P-NEXT: st.s8 r0, [p0, dj0] ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop @@ -390,8 +390,8 @@ define <128 x i8> @insert_v128i8_dyn(<128 x i8> %v, i8 %e, i32 %idx) nounwind { ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop -; AIE2P-NEXT: vldb x4, [p0, #0] ; AIE2P-NEXT: vldb x5, [p0, #64] +; AIE2P-NEXT: vldb x4, [p0, #0] ; AIE2P-NEXT: ret lr ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 @@ -467,8 +467,8 @@ define <64 x i16> @insert_v64i16_dyn(<64 x i16> %v, i16 %e, i32 %idx) nounwind { ; AIE2P-NEXT: lshl r1, r1, r2 ; AIE2P-NEXT: padda [p0], #-128 ; AIE2P-NEXT: mov dj0, r1 -; AIE2P-NEXT: vst x6, [p0, #0] ; AIE2P-NEXT: vst x7, [p0, #64] +; AIE2P-NEXT: vst x6, [p0, #0] ; AIE2P-NEXT: st.s16 r0, [p0, dj0] ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop @@ -476,8 +476,8 @@ define <64 x i16> @insert_v64i16_dyn(<64 x i16> %v, i16 %e, i32 %idx) nounwind { ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop -; AIE2P-NEXT: vldb x4, [p0, #0] ; AIE2P-NEXT: vldb x5, [p0, #64] +; AIE2P-NEXT: vldb x4, [p0, 
#0] ; AIE2P-NEXT: ret lr ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 @@ -547,11 +547,11 @@ define <32 x i32> @insert_v32i32_dyn(<32 x i32> %v, i32 %e, i32 %idx) nounwind { ; AIE2P-NEXT: lshl r1, r1, r2 ; AIE2P-NEXT: padda [p0], #-128 ; AIE2P-NEXT: mov dj0, r1 -; AIE2P-NEXT: vst x6, [p0, #0] ; AIE2P-NEXT: vst x7, [p0, #64] +; AIE2P-NEXT: vst x6, [p0, #0] ; AIE2P-NEXT: st r0, [p0, dj0] -; AIE2P-NEXT: vldb x4, [p0, #0] ; AIE2P-NEXT: vldb x5, [p0, #64] +; AIE2P-NEXT: vldb x4, [p0, #0] ; AIE2P-NEXT: ret lr ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4