diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 4cdd3d553eb4..2b5571ed5682 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -556,6 +556,44 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { return true; } + // After this point, it is assumed our shufflevectors work on vectors that can + // be split into two halves. + if ((DstNumElts % 2) != 0) + return false; + + // {0, 1, ..., n/4 - 1, n/2, n/2 + 1, ..., 3n/4 - 1} -> G_UNMERGE_VALUES + G_CONCAT_VECTORS + // Take the first halves of the two vectors and concatenate them into one + // vector. + GeneratorType FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1); + GeneratorType FirstEightB = + adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1); + + auto UnmergeMatcher = SmallVector{FirstEightA, FirstEightB}; + GeneratorType FirstAndThird = concatGenerators(UnmergeMatcher); + if (matchCombineShuffleVector(MI, FirstAndThird, (DstNumElts / 2) - 1)) { + if (DstNumElts <= 2) + return false; + const Register DstReg = MI.getOperand(0).getReg(); + const LLT HalfSrcTy = + LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType()); + const Register HalfOfA = createUnmergeValue( + MI, MI.getOperand(1).getReg(), + MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts); + const Register HalfOfB = createUnmergeValue( + MI, MI.getOperand(2).getReg(), + MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts); + + const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + if (Mask[0] <= 0) { + Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB}); + } else { + Builder.buildMergeLikeInstr(DstReg, {HalfOfB, HalfOfA}); + } + + MI.eraseFromParent(); + return true; + } + return false; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir index 0de989f8be75..b87fdf8bc552 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates --- name: shuffle_concat_1 @@ -101,7 +102,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(0, undef, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 @@ -179,7 +182,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(undef, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir index 2c9ae5b06b62..1d4651fe70b5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple aarch64-apple-ios -run-pass=aarch64-prelegalizer-combiner %s -o - | FileCheck %s +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates # Check that we canonicalize shuffle_vector(Src1, Src2, mask(0,1,2,3)) # into concat_vector(Src1, Src2). 
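The combine added to tryCombineShuffleVector above fires when the shuffle mask selects the first half of each source operand, one run after the other, in either order. As a reading aid, here is a minimal standalone sketch of that mask shape in plain C++, not the in-tree matcher: it assumes adderGenerator(Lo, Hi, 1) yields the sequence Lo, Lo+1, ..., Hi, that undef lanes are encoded as -1, and the helper names are hypothetical.

#include <cstddef>
#include <vector>

// Returns true if Mask[Pos], Mask[Pos+1], ... equal Begin, Begin+1, ..., End,
// treating -1 (an undef lane) as a wildcard.
static bool matchesRange(const std::vector<int> &Mask, size_t Pos, int Begin,
                         int End) {
  for (int V = Begin; V <= End; ++V, ++Pos)
    if (Mask[Pos] != -1 && Mask[Pos] != V)
      return false;
  return true;
}

// Returns true if Mask is {0 .. N/2-1, N .. N+N/2-1} (first half of operand A,
// then first half of operand B) or the same two runs swapped, where N is the
// element count of each equally sized source. Swapped mirrors the Mask[0]
// check in the patch, which picks the operand order for the concat.
bool isHalfConcatMask(const std::vector<int> &Mask, int N, bool &Swapped) {
  if (N < 4 || N % 2 != 0 || Mask.size() != static_cast<size_t>(N))
    return false; // the combine bails out for DstNumElts <= 2 or odd counts
  if (matchesRange(Mask, 0, 0, N / 2 - 1) &&
      matchesRange(Mask, N / 2, N, N + N / 2 - 1)) {
    Swapped = false;
    return true;
  }
  if (matchesRange(Mask, 0, N, N + N / 2 - 1) &&
      matchesRange(Mask, N / 2, 0, N / 2 - 1)) {
    Swapped = true;
    return true;
  }
  return false;
}

On a match, the patch unmerges each source into halves and concatenates the two low halves, so e.g. mask (0..7, 16..23) on two <16 x s32> sources becomes two G_UNMERGE_VALUES plus one G_CONCAT_VECTORS, as the insert_vector tests added further below check.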
@@ -270,8 +271,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(4, 5, 0, 1) - ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s32>), [[UV3:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV2]](<2 x s32>), [[UV]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit [[CONCAT_VECTORS]](<4 x s32>) %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1(<4 x s32>), shufflemask(4,5,0,1) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 749d6071c98d..89002fc9de43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates ; CHECK-GI: warning: Instruction selection used fallback path for test_bitcastv2f32tov1f64 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_bitcastv1f64tov2f32 @@ -1776,19 +1777,10 @@ entry: } define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { -; CHECK-SD-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI126_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI126_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %vecinit30 @@ -1803,9 +1795,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov b2, v0.b[1] ; CHECK-GI-NEXT: mov b3, v0.b[2] ; CHECK-GI-NEXT: mov b4, v0.b[3] @@ -1814,14 +1804,13 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-GI-NEXT: mov b7, v0.b[6] ; CHECK-GI-NEXT: mov b16, v0.b[7] ; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0] ; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] ; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] ; CHECK-GI-NEXT: mov 
v0.b[4], v5.b[0] ; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] ; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] ; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1999,19 +1988,10 @@ entry: } define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { -; CHECK-SD-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI130_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI130_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> ret <8 x i16> %vecinit14 @@ -2026,17 +2006,14 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov h3, v0.h[2] ; CHECK-GI-NEXT: mov h4, v0.h[3] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0] ; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] ; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2142,19 +2119,10 @@ entry: } define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { -; CHECK-SD-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI134_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI134_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %vecinit6 @@ -2169,13 +2137,10 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov s2, v0.s[1] ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; 
CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir index 082554d3ade3..d14ac147679e 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir @@ -581,3 +581,273 @@ body: | %2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7) PseudoRET implicit $lr, implicit %2 ... + +--- +name: extract_vector_third_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_third_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_third_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_third_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_third_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(4, 5) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_fourth_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_fourth_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_fourth_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(24,25,26,27,28,29,30,31) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_fourth_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(12,13,14,15) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_fourth_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(6,7) + PseudoRET implicit $lr, implicit %0 +... 
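The extract_vector_*_half tests above each reduce a shuffle whose mask is a single contiguous, half-aligned run to one G_UNMERGE_VALUES (or, for 256-bit vectors on AIE2, one G_AIE_UNPAD_VECTOR) of a single operand. Below is a sketch of the index arithmetic involved, assuming both sources have SrcNumElts elements and the mask has SrcNumElts / 2 entries; the names are illustrative, not taken from the patch.

struct HalfSel {
  int Operand; // 0 = first shuffle operand, 1 = second
  int Half;    // 0 = low half, 1 = high half
};

// Maps a contiguous mask run [First, First + Len) onto one half of one
// operand. The combined index space of a shuffle is 2 * SrcNumElts wide, so
// an exact half-sized, half-aligned run lands in one of four quarters.
bool selectHalf(int First, int Len, int SrcNumElts, HalfSel &Out) {
  if (Len * 2 != SrcNumElts || First % Len != 0 || First / Len > 3)
    return false;              // only exact, half-aligned runs qualify
  int Quarter = First / Len;   // 0..3 across the concatenated operands
  Out.Operand = Quarter / 2;   // quarters 0, 1 -> operand A; 2, 3 -> operand B
  Out.Half = Quarter % 2;      // even quarter -> low half, odd -> high half
  return true;
}

For extract_vector_third_half_1024, First = 32 and Len = 16 with SrcNumElts = 32, so Quarter = 2: the low half of the second operand, which is the [[UV]] result of unmerging the $y3 copy. extract_vector_fourth_half_1024 has First = 48, so Quarter = 3 and the high half [[UV1]] is used instead.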
+ +--- +name: insert_vector_16_elements +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_128_elements +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191) + PseudoRET implicit $lr, implicit %3 +... 
+ +--- +name: insert_vector_16_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements_reverse + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV2]](<8 x s32>), [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements_reverse + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[AIE_UNPAD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(8, 9, 10, 11, 0, 1, 2, 3) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_128_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements_reverse + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV2]](<64 x s8>), [[UV]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + PseudoRET implicit $lr, implicit %3 +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index eda80653683b..0284bbbe9d7f 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -49,96 +49,26 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv -; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r24, r16 // Delay Slot 2 -; CHECK-NEXT: mova r16, #0 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x2, r16; nopv -; CHECK-NEXT: vextract.s32 r1, x4, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: vextract.s32 r3, x4, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: vextract.s32 r5, x4, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: vextract.s32 r7, x4, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r8, x2, r16 -; CHECK-NEXT: vextract.s32 r9, x4, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r10, x2, r16 -; CHECK-NEXT: vextract.s32 r11, x4, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r12, x2, r16 -; CHECK-NEXT: vextract.s32 r13, x4, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r14, x2, r16 -; CHECK-NEXT: vextract.s32 r15, x4, r16 -; CHECK-NEXT: vpush.lo.32 x0, r13, x0 -; CHECK-NEXT: vpush.lo.32 x0, r15, x0 -; CHECK-NEXT: vpush.lo.32 x0, r11, x0 -; CHECK-NEXT: vpush.lo.32 x0, r9, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 -; CHECK-NEXT: vpush.lo.32 x0, r12, x0 -; CHECK-NEXT: vpush.lo.32 x0, r14, x0 -; CHECK-NEXT: vpush.lo.32 x0, r10, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r24 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: // %if.then -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r2, x4, r16 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r4, x4, r16 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r6, x4, r16 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r8, x4, r16 -; CHECK-NEXT: vextract.s32 r9, x2, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r10, x4, r16 -; CHECK-NEXT: vextract.s32 r11, x2, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r12, x4, r16 -; CHECK-NEXT: vextract.s32 r13, x2, r16 -; CHECK-NEXT: mova 
r16, #6 -; CHECK-NEXT: vextract.s32 r14, x4, r16 -; CHECK-NEXT: vextract.s32 r15, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r13, x0 -; CHECK-NEXT: vpush.lo.32 x0, r15, x0 -; CHECK-NEXT: vpush.lo.32 x0, r11, x0 -; CHECK-NEXT: vpush.lo.32 x0, r9, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 -; CHECK-NEXT: vpush.lo.32 x0, r12, x0 -; CHECK-NEXT: vpush.lo.32 x0, r14, x0 -; CHECK-NEXT: vpush.lo.32 x0, r10, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r24 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 entry: %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> %cmp = icmp eq i32 %idx, 0